llama_cpp 0.1.4 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
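The most visible API change in this release is GPU offloading and K-quant support in the bundled llama.cpp: `llama_context_params` gains `n_batch`, `main_gpu`, `tensor_split` and `low_vram` fields, and quantization settings move into a new `llama_model_quantize_params` struct (see the llama.cpp diff below). A rough sketch of how the extended C API would be driven follows — this is not taken from the gem; the model path and the concrete values are placeholders, and the quantize call is only referenced in a comment because its signature is not shown in this excerpt:

    // Minimal sketch (assumed usage, not from the gem) of the extended C API bundled in 0.2.1.
    // "model.ggml" is a placeholder path; field values are illustrative only.
    #include "llama.h"

    int main(void) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_batch      = 512;   // new: batch size used when processing prompts
        cparams.n_gpu_layers = 32;    // number of layers to offload to the GPU
        cparams.main_gpu     = 0;     // new: device that hosts small tensors and the scratch buffer
        cparams.low_vram     = false; // new: trade speed for lower VRAM usage

        struct llama_context * ctx = llama_init_from_file("model.ggml", cparams);
        if (ctx == NULL) {
            return 1;
        }

        // Quantization options now live in their own struct; these values would be
        // passed to llama_model_quantize() (its updated signature is not shown in this excerpt).
        struct llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_K_M; // K-quants are new in this release
        qparams.quantize_output_tensor = true;
        (void) qparams;

        llama_free(ctx);
        return 0;
    }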
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -16,6 +16,10 @@
|
|
16
16
|
#include "ggml-opencl.h"
|
17
17
|
#endif
|
18
18
|
|
19
|
+
#ifdef GGML_USE_METAL
|
20
|
+
#include "ggml-metal.h"
|
21
|
+
#endif
|
22
|
+
|
19
23
|
#include <array>
|
20
24
|
#include <ctime>
|
21
25
|
#include <cinttypes>
|
@@ -49,17 +53,22 @@ enum e_model {
|
|
49
53
|
MODEL_65B,
|
50
54
|
};
|
51
55
|
|
52
|
-
|
53
56
|
static const size_t MB = 1024*1024;
|
54
57
|
|
55
58
|
// computed for n_ctx == 2048
|
56
59
|
// TODO: dynamically determine these sizes
|
57
60
|
// needs modifications in ggml
|
58
61
|
|
62
|
+
typedef void (*offload_func_t)(struct ggml_tensor * tensor);
|
63
|
+
|
64
|
+
void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
|
65
|
+
(void) tensor;
|
66
|
+
}
|
67
|
+
|
59
68
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
60
69
|
{
|
61
70
|
static std::map<e_model, size_t> k_sizes = {
|
62
|
-
{ MODEL_3B,
|
71
|
+
{ MODEL_3B, 256ull * MB },
|
63
72
|
{ MODEL_7B, 512ull * MB },
|
64
73
|
{ MODEL_13B, 512ull * MB },
|
65
74
|
{ MODEL_30B, 512ull * MB },
|
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|
71
80
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
72
81
|
{
|
73
82
|
static std::map<e_model, size_t> k_sizes = {
|
74
|
-
{ MODEL_3B,
|
83
|
+
{ MODEL_3B, 256ull * MB },
|
75
84
|
{ MODEL_7B, 512ull * MB },
|
76
85
|
{ MODEL_13B, 512ull * MB },
|
77
86
|
{ MODEL_30B, 512ull * MB },
|
@@ -156,6 +165,11 @@ struct llama_kv_cache {
|
|
156
165
|
if (ctx) {
|
157
166
|
ggml_free(ctx);
|
158
167
|
}
|
168
|
+
|
169
|
+
#ifdef GGML_USE_CUBLAS
|
170
|
+
ggml_cuda_free_data(k);
|
171
|
+
ggml_cuda_free_data(v);
|
172
|
+
#endif // GGML_USE_CUBLAS
|
159
173
|
}
|
160
174
|
};
|
161
175
|
|
@@ -170,6 +184,7 @@ struct llama_model {
|
|
170
184
|
struct ggml_tensor * output;
|
171
185
|
|
172
186
|
std::vector<llama_layer> layers;
|
187
|
+
int n_gpu_layers;
|
173
188
|
|
174
189
|
// context
|
175
190
|
struct ggml_context * ctx = NULL;
|
@@ -195,6 +210,17 @@ struct llama_model {
|
|
195
210
|
if (ctx) {
|
196
211
|
ggml_free(ctx);
|
197
212
|
}
|
213
|
+
|
214
|
+
#ifdef GGML_USE_CUBLAS
|
215
|
+
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
216
|
+
ggml_cuda_free_data(tensors_by_name[i].second);
|
217
|
+
}
|
218
|
+
ggml_cuda_free_scratch();
|
219
|
+
#elif defined(GGML_USE_CLBLAST)
|
220
|
+
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
221
|
+
ggml_cl_free_data(tensors_by_name[i].second);
|
222
|
+
}
|
223
|
+
#endif
|
198
224
|
}
|
199
225
|
};
|
200
226
|
|
@@ -243,6 +269,10 @@ struct llama_context {
|
|
243
269
|
llama_ctx_buffer buf_compute;
|
244
270
|
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
245
271
|
|
272
|
+
#ifdef GGML_USE_METAL
|
273
|
+
ggml_metal_context * ctx_metal = NULL;
|
274
|
+
#endif
|
275
|
+
|
246
276
|
int buf_last = 0;
|
247
277
|
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
248
278
|
|
@@ -282,15 +312,15 @@ template <typename T>
|
|
282
312
|
static T checked_mul(T a, T b) {
|
283
313
|
T ret = a * b;
|
284
314
|
if (a != 0 && ret / a != b) {
|
285
|
-
throw format("overflow multiplying %llu * %llu",
|
286
|
-
(unsigned long long) a, (unsigned long long) b);
|
315
|
+
throw std::runtime_error(format("overflow multiplying %llu * %llu",
|
316
|
+
(unsigned long long) a, (unsigned long long) b));
|
287
317
|
}
|
288
318
|
return ret;
|
289
319
|
}
|
290
320
|
|
291
321
|
static size_t checked_div(size_t a, size_t b) {
|
292
322
|
if (b == 0 || a % b != 0) {
|
293
|
-
throw format("error dividing %zu / %zu", a, b);
|
323
|
+
throw std::runtime_error(format("error dividing %zu / %zu", a, b));
|
294
324
|
}
|
295
325
|
return a / b;
|
296
326
|
}
|
@@ -354,7 +384,7 @@ struct llama_load_tensor {
|
|
354
384
|
const auto & first_shard = shards.at(0);
|
355
385
|
for (const auto & shard : shards) {
|
356
386
|
if (shard.type != first_shard.type) {
|
357
|
-
throw format("inconsistent tensor shard type in '%s'", name.c_str());
|
387
|
+
throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
|
358
388
|
}
|
359
389
|
}
|
360
390
|
type = first_shard.type;
|
@@ -377,8 +407,8 @@ struct llama_load_tensor {
|
|
377
407
|
const auto & first_shard = shards.at(0);
|
378
408
|
for (const auto & shard : shards) {
|
379
409
|
if (shard.ne != first_shard.ne) {
|
380
|
-
throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
|
381
|
-
name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
|
410
|
+
throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
|
411
|
+
name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
|
382
412
|
}
|
383
413
|
}
|
384
414
|
ne = first_shard.ne;
|
@@ -456,8 +486,8 @@ struct llama_file_loader {
|
|
456
486
|
}
|
457
487
|
}
|
458
488
|
|
459
|
-
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
460
|
-
magic, version);
|
489
|
+
throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
490
|
+
magic, version));
|
461
491
|
}
|
462
492
|
void read_hparams() {
|
463
493
|
hparams.n_vocab = file.read_u32();
|
@@ -497,7 +527,7 @@ struct llama_file_loader {
|
|
497
527
|
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
498
528
|
std::string name = file.read_string(name_len);
|
499
529
|
if (n_dims < 1 || n_dims > 2) {
|
500
|
-
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
|
530
|
+
throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
|
501
531
|
}
|
502
532
|
switch (shard.type) {
|
503
533
|
case GGML_TYPE_F32:
|
@@ -507,9 +537,14 @@ struct llama_file_loader {
|
|
507
537
|
case GGML_TYPE_Q5_0:
|
508
538
|
case GGML_TYPE_Q5_1:
|
509
539
|
case GGML_TYPE_Q8_0:
|
540
|
+
case GGML_TYPE_Q2_K:
|
541
|
+
case GGML_TYPE_Q3_K:
|
542
|
+
case GGML_TYPE_Q4_K:
|
543
|
+
case GGML_TYPE_Q5_K:
|
544
|
+
case GGML_TYPE_Q6_K:
|
510
545
|
break;
|
511
546
|
default: {
|
512
|
-
throw format("unrecognized tensor type %u\n", shard.type);
|
547
|
+
throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
|
513
548
|
}
|
514
549
|
}
|
515
550
|
|
@@ -582,6 +617,11 @@ struct llama_file_saver {
|
|
582
617
|
case GGML_TYPE_Q5_0:
|
583
618
|
case GGML_TYPE_Q5_1:
|
584
619
|
case GGML_TYPE_Q8_0:
|
620
|
+
case GGML_TYPE_Q2_K:
|
621
|
+
case GGML_TYPE_Q3_K:
|
622
|
+
case GGML_TYPE_Q4_K:
|
623
|
+
case GGML_TYPE_Q5_K:
|
624
|
+
case GGML_TYPE_Q6_K:
|
585
625
|
break;
|
586
626
|
default: LLAMA_ASSERT(false);
|
587
627
|
}
|
@@ -613,7 +653,7 @@ struct llama_model_loader {
|
|
613
653
|
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
614
654
|
file_loaders.emplace_back(ith_file);
|
615
655
|
if (ith_file->hparams != first_file->hparams) {
|
616
|
-
throw format("llama.cpp: hparams inconsistent between files");
|
656
|
+
throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
|
617
657
|
}
|
618
658
|
}
|
619
659
|
if (!llama_mmap::SUPPORTED) {
|
@@ -643,7 +683,7 @@ struct llama_model_loader {
|
|
643
683
|
uint32_t guess_n_parts() const {
|
644
684
|
auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
|
645
685
|
if (it == tensors_map.name_to_idx.end()) {
|
646
|
-
throw std::string("missing tok_embeddings.weight");
|
686
|
+
throw std::runtime_error(std::string("missing tok_embeddings.weight"));
|
647
687
|
}
|
648
688
|
const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
649
689
|
return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
|
@@ -660,12 +700,12 @@ struct llama_model_loader {
|
|
660
700
|
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
|
661
701
|
auto it = tensors_map.name_to_idx.find(name);
|
662
702
|
if (it == tensors_map.name_to_idx.end()) {
|
663
|
-
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
703
|
+
throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
|
664
704
|
}
|
665
705
|
llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
666
706
|
if (lt.ne != ne) {
|
667
|
-
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
668
|
-
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
707
|
+
throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
708
|
+
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
|
669
709
|
}
|
670
710
|
|
671
711
|
return get_tensor_for(lt, backend);
|
@@ -673,6 +713,9 @@ struct llama_model_loader {
|
|
673
713
|
|
674
714
|
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
|
675
715
|
struct ggml_tensor * tensor;
|
716
|
+
if (backend != GGML_BACKEND_CPU) {
|
717
|
+
ggml_set_no_alloc(ggml_ctx, true);
|
718
|
+
}
|
676
719
|
if (lt.ne.size() == 2) {
|
677
720
|
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
|
678
721
|
} else {
|
@@ -681,6 +724,10 @@ struct llama_model_loader {
|
|
681
724
|
}
|
682
725
|
ggml_set_name(tensor, lt.name.c_str());
|
683
726
|
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
727
|
+
|
728
|
+
if (backend != GGML_BACKEND_CPU) {
|
729
|
+
ggml_set_no_alloc(ggml_ctx, use_mmap);
|
730
|
+
}
|
684
731
|
tensor->backend = backend;
|
685
732
|
lt.ggml_tensor = tensor;
|
686
733
|
num_ggml_tensors_created++;
|
@@ -689,13 +736,14 @@ struct llama_model_loader {
|
|
689
736
|
|
690
737
|
void done_getting_tensors() const {
|
691
738
|
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
|
692
|
-
throw std::string("llama.cpp: file contained more tensors than expected");
|
739
|
+
throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
|
693
740
|
}
|
694
741
|
}
|
695
742
|
|
696
743
|
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
697
744
|
size_t data_size = 0;
|
698
745
|
size_t prefetch_size = 0;
|
746
|
+
size_t lock_size = 0;
|
699
747
|
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
700
748
|
data_size += lt.size;
|
701
749
|
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
@@ -705,11 +753,6 @@ struct llama_model_loader {
|
|
705
753
|
|
706
754
|
if (use_mmap) {
|
707
755
|
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
|
708
|
-
if (!lmlock) {
|
709
|
-
// Don't call the callback since the actual loading will be lazy
|
710
|
-
// and we can't measure it.
|
711
|
-
progress_callback = NULL;
|
712
|
-
}
|
713
756
|
if (lmlock) {
|
714
757
|
lmlock->init(mapping->addr);
|
715
758
|
}
|
@@ -717,20 +760,49 @@ struct llama_model_loader {
|
|
717
760
|
|
718
761
|
size_t done_size = 0;
|
719
762
|
for (llama_load_tensor & lt : tensors_map.tensors) {
|
720
|
-
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
|
721
|
-
continue;
|
722
|
-
}
|
723
763
|
if (progress_callback) {
|
724
764
|
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
725
765
|
}
|
726
766
|
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
|
727
767
|
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
768
|
+
|
769
|
+
// allocate temp buffer if not using mmap
|
770
|
+
if (!use_mmap && lt.data == NULL) {
|
771
|
+
GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
|
772
|
+
lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
|
773
|
+
}
|
774
|
+
|
728
775
|
load_data_for(lt);
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
776
|
+
|
777
|
+
switch(lt.ggml_tensor->backend) {
|
778
|
+
case GGML_BACKEND_CPU:
|
779
|
+
lt.ggml_tensor->data = lt.data;
|
780
|
+
if (use_mmap && lmlock) {
|
781
|
+
lock_size += lt.size;
|
782
|
+
lmlock->grow_to(lock_size);
|
783
|
+
}
|
784
|
+
break;
|
785
|
+
#if defined(GGML_USE_CUBLAS)
|
786
|
+
case GGML_BACKEND_GPU:
|
787
|
+
case GGML_BACKEND_GPU_SPLIT:
|
788
|
+
ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
|
789
|
+
if (!use_mmap) {
|
790
|
+
free(lt.data);
|
791
|
+
}
|
792
|
+
break;
|
793
|
+
#elif defined(GGML_USE_CLBLAST)
|
794
|
+
case GGML_BACKEND_GPU:
|
795
|
+
ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
|
796
|
+
if (!use_mmap) {
|
797
|
+
free(lt.data);
|
798
|
+
}
|
799
|
+
break;
|
800
|
+
#endif
|
801
|
+
default:
|
802
|
+
continue;
|
733
803
|
}
|
804
|
+
|
805
|
+
done_size += lt.size;
|
734
806
|
}
|
735
807
|
}
|
736
808
|
|
@@ -801,7 +873,8 @@ static bool kv_cache_init(
|
|
801
873
|
const struct llama_hparams & hparams,
|
802
874
|
struct llama_kv_cache & cache,
|
803
875
|
ggml_type wtype,
|
804
|
-
int n_ctx
|
876
|
+
int n_ctx,
|
877
|
+
int n_gpu_layers) {
|
805
878
|
const int n_embd = hparams.n_embd;
|
806
879
|
const int n_layer = hparams.n_layer;
|
807
880
|
|
@@ -827,13 +900,26 @@ static bool kv_cache_init(
|
|
827
900
|
ggml_set_name(cache.k, "cache_k");
|
828
901
|
ggml_set_name(cache.v, "cache_v");
|
829
902
|
|
903
|
+
#ifdef GGML_USE_CUBLAS
|
904
|
+
if (n_gpu_layers > n_layer + 1) {
|
905
|
+
ggml_cuda_assign_buffers_no_scratch(cache.v);
|
906
|
+
}
|
907
|
+
if (n_gpu_layers > n_layer + 2) {
|
908
|
+
ggml_cuda_assign_buffers_no_scratch(cache.k);
|
909
|
+
}
|
910
|
+
#endif // GGML_USE_CUBLAS
|
911
|
+
|
830
912
|
return true;
|
831
913
|
}
|
832
914
|
|
833
915
|
struct llama_context_params llama_context_default_params() {
|
834
916
|
struct llama_context_params result = {
|
835
917
|
/*.n_ctx =*/ 512,
|
918
|
+
/*.n_batch =*/ 512,
|
836
919
|
/*.gpu_layers =*/ 0,
|
920
|
+
/*.main_gpu =*/ 0,
|
921
|
+
/*.tensor_split =*/ {0},
|
922
|
+
/*.low_vram =*/ false,
|
837
923
|
/*.seed =*/ -1,
|
838
924
|
/*.f16_kv =*/ true,
|
839
925
|
/*.logits_all =*/ false,
|
@@ -848,6 +934,17 @@ struct llama_context_params llama_context_default_params() {
|
|
848
934
|
return result;
|
849
935
|
}
|
850
936
|
|
937
|
+
struct llama_model_quantize_params llama_model_quantize_default_params() {
|
938
|
+
struct llama_model_quantize_params result = {
|
939
|
+
/*.nthread =*/ 0,
|
940
|
+
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
941
|
+
/*.allow_requantize =*/ false,
|
942
|
+
/*.quantize_output_tensor =*/ true,
|
943
|
+
};
|
944
|
+
|
945
|
+
return result;
|
946
|
+
}
|
947
|
+
|
851
948
|
bool llama_mmap_supported() {
|
852
949
|
return llama_mmap::SUPPORTED;
|
853
950
|
}
|
@@ -898,6 +995,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
|
898
995
|
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
899
996
|
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
900
997
|
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
998
|
+
// K-quants
|
999
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
|
1000
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
|
1001
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
|
1002
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
|
1003
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
|
1004
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
|
1005
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
|
1006
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
|
1007
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
|
901
1008
|
default: return "unknown, may not work";
|
902
1009
|
}
|
903
1010
|
}
|
@@ -917,7 +1024,11 @@ static void llama_model_load_internal(
|
|
917
1024
|
const std::string & fname,
|
918
1025
|
llama_context & lctx,
|
919
1026
|
int n_ctx,
|
1027
|
+
int n_batch,
|
920
1028
|
int n_gpu_layers,
|
1029
|
+
int main_gpu,
|
1030
|
+
const float * tensor_split,
|
1031
|
+
bool low_vram,
|
921
1032
|
ggml_type memory_type,
|
922
1033
|
bool use_mmap,
|
923
1034
|
bool use_mlock,
|
@@ -932,9 +1043,9 @@ static void llama_model_load_internal(
|
|
932
1043
|
lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
|
933
1044
|
auto & model = lctx.model;
|
934
1045
|
model.hparams = ml->file_loaders.at(0)->hparams;
|
1046
|
+
model.n_gpu_layers = n_gpu_layers;
|
935
1047
|
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
|
936
1048
|
auto & hparams = model.hparams;
|
937
|
-
uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
938
1049
|
|
939
1050
|
{
|
940
1051
|
switch (hparams.n_layer) {
|
@@ -943,11 +1054,19 @@ static void llama_model_load_internal(
|
|
943
1054
|
case 40: model.type = e_model::MODEL_13B; break;
|
944
1055
|
case 60: model.type = e_model::MODEL_30B; break;
|
945
1056
|
case 80: model.type = e_model::MODEL_65B; break;
|
1057
|
+
default:
|
1058
|
+
{
|
1059
|
+
if (hparams.n_layer < 32) {
|
1060
|
+
model.type = e_model::MODEL_7B;
|
1061
|
+
}
|
1062
|
+
} break;
|
946
1063
|
}
|
947
1064
|
|
948
1065
|
hparams.n_ctx = n_ctx;
|
949
1066
|
}
|
950
1067
|
|
1068
|
+
const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
1069
|
+
|
951
1070
|
{
|
952
1071
|
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
953
1072
|
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
@@ -967,7 +1086,7 @@ static void llama_model_load_internal(
|
|
967
1086
|
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
968
1087
|
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
969
1088
|
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
970
|
-
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
|
1089
|
+
throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
|
971
1090
|
}
|
972
1091
|
}
|
973
1092
|
|
@@ -975,7 +1094,7 @@ static void llama_model_load_internal(
|
|
975
1094
|
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
976
1095
|
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
|
977
1096
|
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
|
978
|
-
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
|
1097
|
+
throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
|
979
1098
|
}
|
980
1099
|
}
|
981
1100
|
|
@@ -1006,18 +1125,28 @@ static void llama_model_load_internal(
|
|
1006
1125
|
|
1007
1126
|
model.ctx = ggml_init(params);
|
1008
1127
|
if (!model.ctx) {
|
1009
|
-
throw format("ggml_init() failed");
|
1128
|
+
throw std::runtime_error(format("ggml_init() failed"));
|
1010
1129
|
}
|
1011
1130
|
}
|
1012
1131
|
|
1013
|
-
|
1014
|
-
#
|
1132
|
+
(void) main_gpu;
|
1133
|
+
#if defined(GGML_USE_CUBLAS)
|
1134
|
+
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
|
1135
|
+
ggml_cuda_set_main_device(main_gpu);
|
1136
|
+
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
1137
|
+
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
1138
|
+
#elif defined(GGML_USE_CLBLAST)
|
1139
|
+
fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
|
1140
|
+
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
1141
|
+
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
|
1015
1142
|
#else
|
1016
|
-
#define LLAMA_BACKEND_OFFLOAD
|
1143
|
+
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
|
1144
|
+
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
|
1017
1145
|
#endif
|
1018
1146
|
|
1019
1147
|
// prepare memory for the weights
|
1020
|
-
size_t
|
1148
|
+
size_t vram_weights = 0;
|
1149
|
+
size_t vram_scratch = 0;
|
1021
1150
|
{
|
1022
1151
|
const uint32_t n_embd = hparams.n_embd;
|
1023
1152
|
const uint32_t n_layer = hparams.n_layer;
|
@@ -1026,25 +1155,42 @@ static void llama_model_load_internal(
|
|
1026
1155
|
ml->ggml_ctx = ctx;
|
1027
1156
|
|
1028
1157
|
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
1029
|
-
model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
|
1030
1158
|
|
1031
1159
|
// "output" tensor
|
1032
1160
|
{
|
1161
|
+
ggml_backend backend_norm;
|
1033
1162
|
ggml_backend backend_output;
|
1034
1163
|
if (n_gpu_layers > int(n_layer)) { // NOLINT
|
1035
|
-
|
1164
|
+
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
1165
|
+
// on Windows however this is detrimental unless everything is on the GPU
|
1166
|
+
#ifndef _WIN32
|
1167
|
+
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
1168
|
+
#else
|
1169
|
+
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
1170
|
+
#endif // _WIN32
|
1171
|
+
|
1172
|
+
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
1036
1173
|
} else {
|
1174
|
+
backend_norm = GGML_BACKEND_CPU;
|
1037
1175
|
backend_output = GGML_BACKEND_CPU;
|
1038
1176
|
}
|
1039
1177
|
|
1178
|
+
model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
|
1040
1179
|
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
|
1180
|
+
if (backend_norm == GGML_BACKEND_GPU) {
|
1181
|
+
vram_weights += ggml_nbytes(model.norm);
|
1182
|
+
}
|
1183
|
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
1184
|
+
vram_weights += ggml_nbytes(model.output);
|
1185
|
+
}
|
1041
1186
|
}
|
1042
1187
|
|
1043
1188
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
1044
1189
|
|
1045
1190
|
model.layers.resize(n_layer);
|
1046
1191
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
1047
|
-
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
1192
|
+
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
1193
|
+
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
1048
1194
|
|
1049
1195
|
auto & layer = model.layers[i];
|
1050
1196
|
|
@@ -1052,21 +1198,21 @@ static void llama_model_load_internal(
|
|
1052
1198
|
|
1053
1199
|
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
1054
1200
|
|
1055
|
-
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
|
1056
|
-
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd},
|
1057
|
-
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd},
|
1058
|
-
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
|
1201
|
+
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
|
1202
|
+
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
|
1203
|
+
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
|
1204
|
+
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
|
1059
1205
|
|
1060
1206
|
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
1061
1207
|
|
1062
|
-
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
|
1063
|
-
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd},
|
1064
|
-
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
|
1208
|
+
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
|
1209
|
+
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
|
1210
|
+
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);
|
1065
1211
|
|
1066
|
-
if (backend ==
|
1067
|
-
|
1212
|
+
if (backend == GGML_BACKEND_GPU) {
|
1213
|
+
vram_weights +=
|
1068
1214
|
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
1069
|
-
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.
|
1215
|
+
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
1070
1216
|
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
1071
1217
|
}
|
1072
1218
|
}
|
@@ -1081,10 +1227,10 @@ static void llama_model_load_internal(
|
|
1081
1227
|
// this is the total memory required to run the inference
|
1082
1228
|
const size_t mem_required =
|
1083
1229
|
ctx_size +
|
1084
|
-
mmapped_size -
|
1230
|
+
mmapped_size - vram_weights + // weights in VRAM not in memory
|
1085
1231
|
MEM_REQ_SCRATCH0().at(model.type) +
|
1086
1232
|
MEM_REQ_SCRATCH1().at(model.type) +
|
1087
|
-
MEM_REQ_EVAL().at(model.type);
|
1233
|
+
MEM_REQ_EVAL().at (model.type);
|
1088
1234
|
|
1089
1235
|
// this is the memory required by one llama_state
|
1090
1236
|
const size_t mem_required_state =
|
@@ -1093,15 +1239,51 @@ static void llama_model_load_internal(
|
|
1093
1239
|
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
1094
1240
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
1095
1241
|
|
1242
|
+
(void) vram_scratch;
|
1243
|
+
(void) n_batch;
|
1096
1244
|
#ifdef GGML_USE_CUBLAS
|
1245
|
+
if (low_vram) {
|
1246
|
+
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
|
1247
|
+
ggml_cuda_set_scratch_size(0); // disable scratch
|
1248
|
+
} else {
|
1249
|
+
vram_scratch = n_batch * MB;
|
1250
|
+
ggml_cuda_set_scratch_size(vram_scratch);
|
1251
|
+
if (n_gpu_layers > 0) {
|
1252
|
+
fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
|
1253
|
+
__func__, vram_scratch / MB);
|
1254
|
+
}
|
1255
|
+
}
|
1256
|
+
#endif // GGML_USE_CUBLAS
|
1257
|
+
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
1097
1258
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
1098
1259
|
|
1099
|
-
fprintf(stderr, "%s:
|
1260
|
+
fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
1100
1261
|
if (n_gpu_layers > (int) hparams.n_layer) {
|
1101
|
-
fprintf(stderr, "%s:
|
1262
|
+
fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
|
1263
|
+
}
|
1264
|
+
size_t vram_kv_cache = 0;
|
1265
|
+
if (n_gpu_layers > (int) hparams.n_layer + 1) {
|
1266
|
+
if (low_vram) {
|
1267
|
+
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
1268
|
+
} else {
|
1269
|
+
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
|
1270
|
+
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
|
1271
|
+
}
|
1102
1272
|
}
|
1103
|
-
|
1104
|
-
|
1273
|
+
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
1274
|
+
if (low_vram) {
|
1275
|
+
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
1276
|
+
} else {
|
1277
|
+
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
|
1278
|
+
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
|
1279
|
+
}
|
1280
|
+
}
|
1281
|
+
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
|
1282
|
+
fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
|
1283
|
+
__func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
|
1284
|
+
fprintf(stderr, "%s: total VRAM used: %zu MB\n",
|
1285
|
+
__func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
|
1286
|
+
#else
|
1105
1287
|
(void) n_gpu_layers;
|
1106
1288
|
#endif
|
1107
1289
|
}
|
@@ -1111,57 +1293,15 @@ static void llama_model_load_internal(
|
|
1111
1293
|
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
|
1112
1294
|
}
|
1113
1295
|
|
1114
|
-
|
1115
|
-
|
1116
|
-
#ifdef GGML_USE_CUBLAS
|
1296
|
+
(void) tensor_split;
|
1297
|
+
#if defined(GGML_USE_CUBLAS)
|
1117
1298
|
{
|
1118
|
-
|
1119
|
-
size_t data_size = 0;
|
1120
|
-
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
1121
|
-
data_size += lt.size;
|
1122
|
-
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
1123
|
-
done_size += lt.size;
|
1124
|
-
}
|
1125
|
-
}
|
1126
|
-
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
1127
|
-
if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
|
1128
|
-
continue;
|
1129
|
-
}
|
1130
|
-
if (progress_callback) {
|
1131
|
-
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
1132
|
-
}
|
1133
|
-
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
|
1134
|
-
done_size += lt.size;
|
1135
|
-
}
|
1136
|
-
}
|
1137
|
-
#elif defined(GGML_USE_CLBLAST)
|
1138
|
-
{
|
1139
|
-
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
1140
|
-
|
1141
|
-
fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
|
1142
|
-
|
1143
|
-
size_t vram_total = 0;
|
1144
|
-
|
1145
|
-
for (int i = 0; i < n_gpu; ++i) {
|
1146
|
-
const auto & layer = model.layers[i];
|
1147
|
-
|
1148
|
-
ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
|
1149
|
-
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
|
1150
|
-
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
|
1151
|
-
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
|
1152
|
-
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
|
1153
|
-
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
|
1154
|
-
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
|
1155
|
-
}
|
1156
|
-
if (n_gpu_layers > (int) hparams.n_layer) {
|
1157
|
-
fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
|
1158
|
-
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
|
1159
|
-
}
|
1160
|
-
|
1161
|
-
fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
|
1299
|
+
ggml_cuda_set_tensor_split(tensor_split);
|
1162
1300
|
}
|
1163
1301
|
#endif
|
1164
1302
|
|
1303
|
+
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
|
1304
|
+
|
1165
1305
|
if (progress_callback) {
|
1166
1306
|
progress_callback(1.0f, progress_callback_user_data);
|
1167
1307
|
}
|
@@ -1177,7 +1317,11 @@ static bool llama_model_load(
|
|
1177
1317
|
const std::string & fname,
|
1178
1318
|
llama_context & lctx,
|
1179
1319
|
int n_ctx,
|
1320
|
+
int n_batch,
|
1180
1321
|
int n_gpu_layers,
|
1322
|
+
int main_gpu,
|
1323
|
+
float * tensor_split,
|
1324
|
+
bool low_vram,
|
1181
1325
|
ggml_type memory_type,
|
1182
1326
|
bool use_mmap,
|
1183
1327
|
bool use_mlock,
|
@@ -1185,28 +1329,30 @@ static bool llama_model_load(
|
|
1185
1329
|
llama_progress_callback progress_callback,
|
1186
1330
|
void *progress_callback_user_data) {
|
1187
1331
|
try {
|
1188
|
-
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers,
|
1189
|
-
vocab_only, progress_callback, progress_callback_user_data);
|
1332
|
+
llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
1333
|
+
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1190
1334
|
return true;
|
1191
|
-
} catch (const std::
|
1192
|
-
fprintf(stderr, "error loading model: %s\n", err.
|
1335
|
+
} catch (const std::exception & err) {
|
1336
|
+
fprintf(stderr, "error loading model: %s\n", err.what());
|
1193
1337
|
return false;
|
1194
1338
|
}
|
1195
1339
|
}
|
1196
1340
|
|
1197
1341
|
// evaluate the transformer
|
1198
1342
|
//
|
1199
|
-
// - lctx:
|
1200
|
-
// - tokens:
|
1201
|
-
// - n_past:
|
1202
|
-
// - n_threads:
|
1343
|
+
// - lctx: llama context
|
1344
|
+
// - tokens: new batch of tokens to process
|
1345
|
+
// - n_past: the context size so far
|
1346
|
+
// - n_threads: number of threads to use
|
1347
|
+
// - cgraph_fname: filename of the exported computation graph
|
1203
1348
|
//
|
1204
1349
|
static bool llama_eval_internal(
|
1205
|
-
llama_context &
|
1206
|
-
const llama_token *
|
1207
|
-
const int
|
1208
|
-
const int
|
1209
|
-
const int
|
1350
|
+
llama_context & lctx,
|
1351
|
+
const llama_token * tokens,
|
1352
|
+
const int n_tokens,
|
1353
|
+
const int n_past,
|
1354
|
+
const int n_threads,
|
1355
|
+
const char * cgraph_fname) {
|
1210
1356
|
|
1211
1357
|
// enforce that the first token is BOS
|
1212
1358
|
if (n_past == 0 && tokens[0] != llama_token_bos()) {
|
@@ -1225,12 +1371,13 @@ static bool llama_eval_internal(
|
|
1225
1371
|
|
1226
1372
|
LLAMA_ASSERT(!!kv_self.ctx);
|
1227
1373
|
|
1228
|
-
const int n_embd
|
1229
|
-
const int n_layer
|
1230
|
-
const int n_ctx
|
1231
|
-
const int n_head
|
1232
|
-
const int n_vocab
|
1233
|
-
const int n_rot
|
1374
|
+
const int n_embd = hparams.n_embd;
|
1375
|
+
const int n_layer = hparams.n_layer;
|
1376
|
+
const int n_ctx = hparams.n_ctx;
|
1377
|
+
const int n_head = hparams.n_head;
|
1378
|
+
const int n_vocab = hparams.n_vocab;
|
1379
|
+
const int n_rot = hparams.n_embd/hparams.n_head;
|
1380
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
1234
1381
|
|
1235
1382
|
auto & mem_per_token = lctx.mem_per_token;
|
1236
1383
|
auto & buf_compute = lctx.buf_compute;
|
@@ -1252,40 +1399,98 @@ static bool llama_eval_internal(
|
|
1252
1399
|
ggml_set_name(embd, "embd");
|
1253
1400
|
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
1254
1401
|
|
1402
|
+
struct ggml_tensor * cur;
|
1255
1403
|
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
1256
1404
|
|
1405
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
1406
|
+
(void) i_gpu_start;
|
1407
|
+
|
1408
|
+
// offload functions set the tensor output backend to GPU
|
1409
|
+
// tensors are GPU-accelerated if any input or the output has been offloaded
|
1410
|
+
//
|
1411
|
+
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
1412
|
+
// in that case ggml_cuda_assign_buffers has no effect
|
1413
|
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
1414
|
+
offload_func_t offload_func_kq = llama_nop;
|
1415
|
+
offload_func_t offload_func_v = llama_nop;
|
1416
|
+
|
1417
|
+
#ifdef GGML_USE_CUBLAS
|
1418
|
+
if (n_gpu_layers > n_layer) {
|
1419
|
+
offload_func_nr = ggml_cuda_assign_buffers;
|
1420
|
+
}
|
1421
|
+
if (n_gpu_layers > n_layer + 1) {
|
1422
|
+
offload_func_v = ggml_cuda_assign_buffers;
|
1423
|
+
}
|
1424
|
+
if (n_gpu_layers > n_layer + 2) {
|
1425
|
+
offload_func_kq = ggml_cuda_assign_buffers;
|
1426
|
+
}
|
1427
|
+
#endif // GGML_USE_CUBLAS
|
1428
|
+
|
1257
1429
|
for (int il = 0; il < n_layer; ++il) {
|
1258
|
-
|
1430
|
+
offload_func_t offload_func = llama_nop;
|
1431
|
+
|
1432
|
+
#ifdef GGML_USE_CUBLAS
|
1433
|
+
if (il >= i_gpu_start) {
|
1434
|
+
offload_func = ggml_cuda_assign_buffers;
|
1435
|
+
}
|
1436
|
+
#endif // GGML_USE_CUBLAS
|
1259
1437
|
|
1260
|
-
struct ggml_tensor *
|
1438
|
+
struct ggml_tensor * inpSA = inpL;
|
1261
1439
|
|
1262
1440
|
lctx.use_buf(ctx0, 0);
|
1263
1441
|
|
1264
1442
|
// norm
|
1265
1443
|
{
|
1266
1444
|
cur = ggml_rms_norm(ctx0, inpL);
|
1445
|
+
offload_func(cur);
|
1446
|
+
ggml_set_name(cur, "rms_norm_0");
|
1267
1447
|
|
1268
1448
|
// cur = cur*attention_norm(broadcasted)
|
1269
1449
|
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
|
1450
|
+
offload_func(cur);
|
1451
|
+
ggml_set_name(cur, "attention_norm_0");
|
1270
1452
|
}
|
1271
1453
|
|
1272
1454
|
// self-attention
|
1273
1455
|
{
|
1274
1456
|
// compute Q and K and RoPE them
|
1275
|
-
struct ggml_tensor *
|
1276
|
-
|
1277
|
-
ggml_set_name(
|
1457
|
+
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
1458
|
+
offload_func_kq(tmpk);
|
1459
|
+
ggml_set_name(tmpk, "tmpk");
|
1460
|
+
|
1461
|
+
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
1462
|
+
offload_func_kq(tmpq);
|
1463
|
+
ggml_set_name(tmpq, "tmpq");
|
1464
|
+
|
1465
|
+
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
1466
|
+
offload_func_kq(Kcur);
|
1278
1467
|
ggml_set_name(Kcur, "Kcur");
|
1279
1468
|
|
1469
|
+
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
1470
|
+
offload_func_kq(Qcur);
|
1471
|
+
ggml_set_name(Qcur, "Qcur");
|
1472
|
+
|
1280
1473
|
// store key and value to memory
|
1281
1474
|
{
|
1282
1475
|
// compute the transposed [N, n_embd] V matrix
|
1283
|
-
|
1476
|
+
|
1477
|
+
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
1478
|
+
offload_func_v(tmpv);
|
1479
|
+
ggml_set_name(tmpv, "tmpv");
|
1480
|
+
|
1481
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
|
1482
|
+
offload_func_v(Vcur);
|
1483
|
+
ggml_set_name(Vcur, "Vcur");
|
1284
1484
|
|
1285
1485
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
1486
|
+
offload_func_kq(k);
|
1487
|
+
ggml_set_name(k, "k");
|
1488
|
+
|
1286
1489
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
|
1287
1490
|
( n_ctx)*ggml_element_size(kv_self.v),
|
1288
1491
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
|
1492
|
+
offload_func_v(v);
|
1493
|
+
ggml_set_name(v, "v");
|
1289
1494
|
|
1290
1495
|
// important: storing RoPE-ed version of K in the KV cache!
|
1291
1496
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
@@ -1296,6 +1501,7 @@ static bool llama_eval_internal(
|
|
1296
1501
|
ggml_permute(ctx0,
|
1297
1502
|
Qcur,
|
1298
1503
|
0, 2, 1, 3);
|
1504
|
+
offload_func_kq(Q);
|
1299
1505
|
ggml_set_name(Q, "Q");
|
1300
1506
|
|
1301
1507
|
struct ggml_tensor * K =
|
@@ -1304,10 +1510,12 @@ static bool llama_eval_internal(
|
|
1304
1510
|
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
1305
1511
|
n_embd/n_head, n_head, n_past + N),
|
1306
1512
|
0, 2, 1, 3);
|
1513
|
+
offload_func_kq(K);
|
1307
1514
|
ggml_set_name(K, "K");
|
1308
1515
|
|
1309
1516
|
// K * Q
|
1310
1517
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
1518
|
+
offload_func_kq(KQ);
|
1311
1519
|
ggml_set_name(KQ, "KQ");
|
1312
1520
|
|
1313
1521
|
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
@@ -1316,17 +1524,19 @@ static bool llama_eval_internal(
|
|
1316
1524
|
|
1317
1525
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
1318
1526
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
1527
|
+
offload_func_kq(KQ_scaled);
|
1319
1528
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
1320
1529
|
|
1321
1530
|
// KQ_masked = mask_past(KQ_scaled)
|
1322
1531
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
1532
|
+
offload_func_kq(KQ_masked);
|
1323
1533
|
ggml_set_name(KQ_masked, "KQ_masked");
|
1324
1534
|
|
1325
1535
|
// KQ = soft_max(KQ_masked)
|
1326
1536
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
1537
|
+
offload_func_v(KQ_soft_max);
|
1327
1538
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
1328
1539
|
|
1329
|
-
|
1330
1540
|
// split cached V into n_head heads
|
1331
1541
|
struct ggml_tensor * V =
|
1332
1542
|
ggml_view_3d(ctx0, kv_self.v,
|
@@ -1334,10 +1544,12 @@ static bool llama_eval_internal(
|
|
1334
1544
|
n_ctx*ggml_element_size(kv_self.v),
|
1335
1545
|
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
|
1336
1546
|
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
|
1547
|
+
offload_func_v(V);
|
1337
1548
|
ggml_set_name(V, "V");
|
1338
1549
|
|
1339
1550
|
#if 1
|
1340
1551
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
1552
|
+
offload_func_v(KQV);
|
1341
1553
|
ggml_set_name(KQV, "KQV");
|
1342
1554
|
#else
|
1343
1555
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
@@ -1349,56 +1561,79 @@ static bool llama_eval_internal(
|
|
1349
1561
|
|
1350
1562
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
1351
1563
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
1564
|
+
offload_func_v(KQV_merged);
|
1352
1565
|
ggml_set_name(KQV_merged, "KQV_merged");
|
1353
1566
|
|
1354
1567
|
// cur = KQV_merged.contiguous().view(n_embd, N)
|
1355
1568
|
cur = ggml_cpy(ctx0,
|
1356
1569
|
KQV_merged,
|
1357
1570
|
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
1571
|
+
offload_func_v(cur);
|
1358
1572
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
1359
1573
|
|
1360
1574
|
// projection (no bias)
|
1361
1575
|
cur = ggml_mul_mat(ctx0,
|
1362
1576
|
model.layers[il].wo,
|
1363
1577
|
cur);
|
1578
|
+
offload_func(cur);
|
1579
|
+
ggml_set_name(cur, "result_wo");
|
1364
1580
|
}
|
1365
1581
|
|
1366
1582
|
lctx.use_buf(ctx0, 1);
|
1367
1583
|
|
1368
1584
|
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
|
1585
|
+
offload_func(inpFF);
|
1586
|
+
ggml_set_name(inpFF, "inpFF");
|
1369
1587
|
|
1370
1588
|
// feed-forward network
|
1371
1589
|
{
|
1372
1590
|
// norm
|
1373
1591
|
{
|
1374
1592
|
cur = ggml_rms_norm(ctx0, inpFF);
|
1593
|
+
offload_func(cur);
|
1594
|
+
ggml_set_name(cur, "rms_norm_1");
|
1375
1595
|
|
1376
1596
|
// cur = cur*ffn_norm(broadcasted)
|
1377
1597
|
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
1598
|
+
offload_func(cur);
|
1599
|
+
ggml_set_name(cur, "ffn_norm");
|
1378
1600
|
}
|
1379
1601
|
|
1380
1602
|
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
|
1381
1603
|
model.layers[il].w3,
|
1382
1604
|
cur);
|
1605
|
+
offload_func(tmp);
|
1606
|
+
ggml_set_name(tmp, "result_w3");
|
1383
1607
|
|
1384
1608
|
cur = ggml_mul_mat(ctx0,
|
1385
1609
|
model.layers[il].w1,
|
1386
1610
|
cur);
|
1611
|
+
offload_func(cur);
|
1612
|
+
ggml_set_name(cur, "result_w2");
|
1387
1613
|
|
1388
1614
|
// SILU activation
|
1389
1615
|
cur = ggml_silu(ctx0, cur);
|
1616
|
+
offload_func(cur);
|
1617
|
+
ggml_set_name(cur, "silu");
|
1390
1618
|
|
1391
1619
|
cur = ggml_mul(ctx0, cur, tmp);
|
1620
|
+
offload_func(cur);
|
1621
|
+
ggml_set_name(cur, "silu_x_result_w3");
|
1392
1622
|
|
1393
1623
|
cur = ggml_mul_mat(ctx0,
|
1394
1624
|
model.layers[il].w2,
|
1395
1625
|
cur);
|
1626
|
+
offload_func(cur);
|
1627
|
+
ggml_set_name(cur, "result_w2");
|
1396
1628
|
}
|
1397
1629
|
|
1398
1630
|
cur = ggml_add(ctx0, cur, inpFF);
|
1631
|
+
offload_func(cur);
|
1632
|
+
ggml_set_name(cur, "inpFF_+_result_w2");
|
1399
1633
|
|
1400
1634
|
// input for next layer
|
1401
1635
|
inpL = cur;
|
1636
|
+
|
1402
1637
|
}
|
1403
1638
|
|
1404
1639
|
lctx.use_buf(ctx0, 0);
|
@@ -1406,28 +1641,68 @@ static bool llama_eval_internal(
|
|
1406
1641
|
// used at the end to optionally extract the embeddings
|
1407
1642
|
struct ggml_tensor * embeddings = NULL;
|
1408
1643
|
|
1644
|
+
|
1409
1645
|
// norm
|
1410
1646
|
{
|
1647
|
+
cur = ggml_rms_norm(ctx0, inpL);
|
1648
|
+
offload_func_nr(cur);
|
1649
|
+
ggml_set_name(cur, "rms_norm_inpL");
|
1411
1650
|
|
1412
|
-
|
1651
|
+
cur = ggml_rms_norm(ctx0, cur);
|
1652
|
+
offload_func_nr(cur);
|
1653
|
+
ggml_set_name(cur, "rms_norm_after");
|
1413
1654
|
|
1414
|
-
//
|
1415
|
-
|
1655
|
+
// cur = cur*norm(broadcasted)
|
1656
|
+
cur = ggml_mul(ctx0, cur, model.norm);
|
1657
|
+
offload_func_nr(cur);
|
1658
|
+
ggml_set_name(cur, "result_norm");
|
1416
1659
|
|
1417
|
-
embeddings =
|
1660
|
+
embeddings = cur;
|
1418
1661
|
}
|
1419
1662
|
|
1663
|
+
|
1420
1664
|
// lm_head
|
1421
|
-
|
1665
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
1666
|
+
ggml_set_name(cur, "result_output");
|
1422
1667
|
|
1423
1668
|
lctx.use_buf(ctx0, -1);
|
1424
1669
|
|
1425
1670
|
// logits -> probs
|
1426
|
-
//
|
1671
|
+
//cur = ggml_soft_max_inplace(ctx0, cur);
|
1427
1672
|
|
1428
1673
|
// run the computation
|
1429
|
-
ggml_build_forward_expand(&gf,
|
1430
|
-
|
1674
|
+
ggml_build_forward_expand(&gf, cur);
|
1675
|
+
|
1676
|
+
#ifdef GGML_USE_METAL
|
1677
|
+
if (lctx.ctx_metal && N == 1) {
|
1678
|
+
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
1679
|
+
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
1680
|
+
} else {
|
1681
|
+
// IMPORTANT:
|
1682
|
+
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
|
1683
|
+
// ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
|
1684
|
+
// coprocessor.
|
1685
|
+
//
|
1686
|
+
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
|
1687
|
+
// But for now, we have focused only on Matrix x Vector Metal multiplication.
|
1688
|
+
//
|
1689
|
+
// TODO: avoid these syncs via shared memory (ref #1696)
|
1690
|
+
//
|
1691
|
+
if (lctx.ctx_metal) {
|
1692
|
+
// We need to sync the GPU KV cache with the CPU KV cache
|
1693
|
+
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
|
1694
|
+
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
1695
|
+
}
|
1696
|
+
|
1697
|
+
ggml_graph_compute(ctx0, &gf);
|
1698
|
+
}
|
1699
|
+
#else
|
1700
|
+
ggml_graph_compute(ctx0, &gf);
|
1701
|
+
#endif
|
1702
|
+
|
1703
|
+
if (cgraph_fname) {
|
1704
|
+
ggml_graph_export(&gf, cgraph_fname);
|
1705
|
+
}
|
1431
1706
|
|
1432
1707
|
#ifdef GGML_PERF
|
1433
1708
|
// print timing information per ggml operation (for debugging purposes)
|
@@ -1441,7 +1716,7 @@ static bool llama_eval_internal(
|
|
1441
1716
|
//}
|
1442
1717
|
|
1443
1718
|
//embd_w.resize(n_vocab*N);
|
1444
|
-
//memcpy(embd_w.data(), ggml_get_data(
|
1719
|
+
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
1445
1720
|
|
1446
1721
|
// update kv token count
|
1447
1722
|
lctx.model.kv_self.n = n_past + N;
|
@@ -1452,11 +1727,11 @@ static bool llama_eval_internal(
|
|
1452
1727
|
|
1453
1728
|
if (lctx.logits_all) {
|
1454
1729
|
logits_out.resize(n_vocab * N);
|
1455
|
-
memcpy(logits_out.data(), (float *) ggml_get_data(
|
1730
|
+
memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
1456
1731
|
} else {
|
1457
1732
|
// return result for just the last token
|
1458
1733
|
logits_out.resize(n_vocab);
|
1459
|
-
memcpy(logits_out.data(), (float *) ggml_get_data(
|
1734
|
+
memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
1460
1735
|
}
|
1461
1736
|
}
|
1462
1737
|
|
@@ -1987,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
|
|
1987
2262
|
return -log2f(candidate.p) > *mu;
|
1988
2263
|
}));
|
1989
2264
|
|
2265
|
+
if (candidates->size == 0) {
|
2266
|
+
candidates->size = 1;
|
2267
|
+
}
|
2268
|
+
|
1990
2269
|
// Normalize the probabilities of the remaining words
|
1991
2270
|
llama_sample_softmax(ctx, candidates);
|
1992
2271
|
|
@@ -2055,16 +2334,92 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
2055
2334
|
// quantization
|
2056
2335
|
//
|
2057
2336
|
|
2058
|
-
static void
|
2337
|
+
static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
|
2338
|
+
if (output.size < nelements * sizeof(float)) {
|
2339
|
+
output.resize(nelements * sizeof(float));
|
2340
|
+
}
|
2341
|
+
float * f32_output = (float *) output.addr;
|
2342
|
+
|
2343
|
+
quantize_fns_t qtype;
|
2344
|
+
if (ggml_is_quantized(tensor.type)) {
|
2345
|
+
qtype = ggml_internal_get_quantize_fn(tensor.type);
|
2346
|
+
if (qtype.dequantize_row_q == NULL) {
|
2347
|
+
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
|
2348
|
+
}
|
2349
|
+
} else if (tensor.type != GGML_TYPE_F16) {
|
2350
|
+
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
|
2351
|
+
}
|
2352
|
+
|
2353
|
+
if (nthread < 2) {
|
2354
|
+
if (tensor.type == GGML_TYPE_F16) {
|
2355
|
+
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
|
2356
|
+
} else if (ggml_is_quantized(tensor.type)) {
|
2357
|
+
qtype.dequantize_row_q(tensor.data, f32_output, nelements);
|
2358
|
+
} else {
|
2359
|
+
LLAMA_ASSERT(false); // unreachable
|
2360
|
+
}
|
2361
|
+
return;
|
2362
|
+
}
|
2363
|
+
|
2364
|
+
auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
|
2365
|
+
auto block_size_bytes = ggml_type_size(tensor.type);
|
2366
|
+
|
2367
|
+
LLAMA_ASSERT(nelements % block_size == 0);
|
2368
|
+
auto nblocks = nelements / block_size;
|
2369
|
+
auto blocks_per_thread = nblocks / nthread;
|
2370
|
+
auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
|
2371
|
+
|
2372
|
+
std::vector<std::thread> workers;
|
2373
|
+
for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
|
2374
|
+
auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
|
2375
|
+
auto thr_elems = thr_blocks * block_size; // number of elements for this thread
|
2376
|
+
auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
|
2377
|
+
|
2378
|
+
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
2379
|
+
if (typ == GGML_TYPE_F16) {
|
2380
|
+
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
2381
|
+
} else {
|
2382
|
+
qtype.dequantize_row_q(inbuf, outbuf, nels);
|
2383
|
+
}
|
2384
|
+
};
|
2385
|
+
workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
|
2386
|
+
in_buff_offs += thr_block_bytes;
|
2387
|
+
out_buff_offs += thr_elems;
|
2388
|
+
}
|
2389
|
+
for (auto & worker : workers) {
|
2390
|
+
worker.join();
|
2391
|
+
}
|
2392
|
+
|
2393
|
+
}
|
2394
|
+
|
2395
|
+
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
2059
2396
|
ggml_type quantized_type;
|
2060
|
-
|
2397
|
+
llama_ftype ftype = params->ftype;
|
2398
|
+
int nthread = params->nthread;
|
2399
|
+
|
2400
|
+
switch (params->ftype) {
|
2061
2401
|
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
2062
2402
|
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
2063
2403
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
2064
2404
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
2065
2405
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
2066
|
-
|
2067
|
-
|
2406
|
+
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
2407
|
+
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
2408
|
+
|
2409
|
+
#ifdef GGML_USE_K_QUANTS
|
2410
|
+
// K-quants
|
2411
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
2412
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
2413
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
2414
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
2415
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
2416
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
|
2417
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
2418
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
2419
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
2420
|
+
#endif
|
2421
|
+
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
2422
|
+
}
|
2068
2423
|
|
2069
2424
|
if (nthread <= 0) {
|
2070
2425
|
nthread = std::thread::hardware_concurrency();
|
@@ -2072,7 +2427,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2072
2427
|
|
2073
2428
|
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
|
2074
2429
|
/*vocab_only*/ false));
|
2075
|
-
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
2430
|
+
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
|
2431
|
+
|
2432
|
+
#ifdef GGML_USE_K_QUANTS
|
2433
|
+
int n_attention_wv = 0;
|
2434
|
+
int n_feed_forward_w2 = 0;
|
2435
|
+
for (auto& tensor : model_loader->tensors_map.tensors) {
|
2436
|
+
if (tensor.name.find("attention.wv.weight") != std::string::npos) {
|
2437
|
+
++n_attention_wv;
|
2438
|
+
}
|
2439
|
+
else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
|
2440
|
+
++n_feed_forward_w2;
|
2441
|
+
}
|
2442
|
+
}
|
2443
|
+
|
2444
|
+
int i_attention_wv = 0;
|
2445
|
+
int i_feed_forward_w2 = 0;
|
2446
|
+
#endif
|
2076
2447
|
|
2077
2448
|
size_t total_size_org = 0;
|
2078
2449
|
size_t total_size_new = 0;
|
@@ -2098,11 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2098
2469
|
|
2099
2470
|
// quantize only 2D tensors
|
2100
2471
|
quantize &= (tensor.ne.size() == 2);
|
2101
|
-
|
2102
|
-
|
2103
|
-
//if (tensor.name == "output.weight") {
|
2104
|
-
// quantize = false;
|
2105
|
-
//}
|
2472
|
+
quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
|
2473
|
+
quantize &= quantized_type != tensor.type;
|
2106
2474
|
|
2107
2475
|
enum ggml_type new_type;
|
2108
2476
|
void * new_data;
|
@@ -2116,20 +2484,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
        } else {
            new_type = quantized_type;
+#ifdef GGML_USE_K_QUANTS
+            if (tensor.name == "output.weight") {
+                new_type = GGML_TYPE_Q6_K;
+            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
+#endif
+
            float * f32_data;
            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
            llama_buffer f32_conv_buf;
+
            if (tensor.type == GGML_TYPE_F32) {
                f32_data = (float *) tensor.data;
-            } else if (tensor.type == GGML_TYPE_F16) {
-
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
            } else {
-
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
            }
 
            printf("quantizing .. ");
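As a rough illustration of the GGML_USE_K_QUANTS branch above: under LLAMA_FTYPE_MOSTLY_Q4_K_M (or Q5_K_M), the first eighth, the last eighth, and every third of the remaining attention.wv tensors are bumped to GGML_TYPE_Q6_K. The standalone sketch below replays only that index arithmetic; the tensor count of 32 is a hypothetical value chosen for the example, not taken from the diff.

// Illustration only: replays the Q4_K_M/Q5_K_M index rule from the hunk above
// for a hypothetical model with 32 attention.wv tensors.
#include <cstdio>

int main() {
    const int n_attention_wv = 32; // assumed layer count for the example
    for (int i_attention_wv = 0; i_attention_wv < n_attention_wv; ++i_attention_wv) {
        const bool bump = i_attention_wv < n_attention_wv/8 ||
                          i_attention_wv >= 7*n_attention_wv/8 ||
                          (i_attention_wv - n_attention_wv/8) % 3 == 2;
        printf("wv tensor %2d -> %s\n", i_attention_wv, bump ? "Q6_K" : "Q4_K");
    }
    return 0;
}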
@@ -2183,12 +2571,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            }
 
            printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            int64_t tot_count = 0;
            for (size_t i = 0; i < hist_cur.size(); i++) {
                hist_all[i] += hist_cur[i];
+                tot_count += hist_cur[i];
            }
 
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                printf("%5.3f ", hist_cur[i] / float(nelements));
+            if (tot_count > 0) {
+                for (size_t i = 0; i < hist_cur.size(); i++) {
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                }
            }
            printf("\n");
        }
@@ -2206,11 +2598,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            sum_all += hist_all[i];
        }
 
-        printf("%s: hist: ", __func__);
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            printf("%5.3f ", hist_all[i] / float(sum_all));
+        if (sum_all > 0) {
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); i++) {
+                printf("%5.3f ", hist_all[i] / float(sum_all));
+            }
+            printf("\n");
        }
-        printf("\n");
    }
 }
 
@@ -2251,9 +2645,9 @@ struct llama_context * llama_init_from_file(
 
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers,
-                          memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
-                          params.progress_callback, params.progress_callback_user_data)) {
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
        return nullptr;
@@ -2261,7 +2655,7 @@ struct llama_context * llama_init_from_file(
 
    // reserve memory for context buffers
    if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
@@ -2291,6 +2685,38 @@ struct llama_context * llama_init_from_file(
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }
 
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void *data_ptr = NULL;
+        size_t data_size = 0;
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size= ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size= ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result) \
+    if (!(result)) { \
+        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+        llama_free(ctx); \
+        return NULL; \
+    }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
    return ctx;
 }
 
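A minimal caller-side sketch of how the Metal path above would be reached, assuming a build with GGML_USE_METAL and the existing llama_context_default_params()/llama_init_from_file() API; the model path argument is a placeholder.

// Sketch, not from the diff: request at least one GPU layer so the
// ggml_metal_init() branch above runs, and use mmap so the mapped model
// file is registered as the "data" Metal buffer.
#include "llama.h"

struct llama_context * init_with_metal(const char * path_model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_gpu_layers = 1;    // any value > 0 enables the Metal setup above
    cparams.use_mmap     = true; // mapped weights become the "data" buffer

    return llama_init_from_file(path_model, cparams);
}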
@@ -2301,13 +2727,12 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
-        enum llama_ftype ftype,
-        int nthread) {
+        const llama_model_quantize_params *params) {
    try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+        llama_model_quantize_internal(fname_inp, fname_out, params);
        return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }
 }
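For callers, the quantize entry point now takes a llama_model_quantize_params struct instead of an ftype/nthread pair. A minimal sketch, assuming the llama_model_quantize_default_params() helper declared in llama.h; the file names are placeholders.

// Sketch: quantize an F16 GGML file to Q4_K_M through the new struct-based API.
// The fields mirror those referenced in the hunks above
// (ftype, nthread, allow_requantize, quantize_output_tensor).
#include "llama.h"

int quantize_q4_k_m() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.nthread                = 4;     // worker threads used during conversion
    qparams.allow_requantize       = false; // refuse already-quantized inputs (see the throw above)
    qparams.quantize_output_tensor = true;  // set to false to leave output.weight untouched

    // returns 0 on success, 1 on failure (exception caught in llama_model_quantize)
    return llama_model_quantize("model-f16.bin", "model-q4_k_m.bin", &qparams);
}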
@@ -2560,8 +2985,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
    try {
        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
        return 1;
    }
 }
@@ -2906,7 +3331,7 @@ int llama_eval(
        int n_tokens,
        int n_past,
        int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
@@ -2921,6 +3346,20 @@ int llama_eval(
    return 0;
 }
 
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx = 512 - n_batch;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
        struct llama_context * ctx,
        const char * text,
@@ -2953,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i] = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
 }
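The new llama_get_vocab() copies up to `capacity` token strings and scores into caller-provided arrays and returns how many entries were written. A small sketch, assuming the existing llama_n_vocab() accessor, that dumps the whole vocabulary:

// Sketch: print every token and its score via the new accessor.
#include <cstdio>
#include <vector>
#include "llama.h"

void dump_vocab(const struct llama_context * ctx) {
    const int n_vocab = llama_n_vocab(ctx);
    std::vector<const char *> strings(n_vocab);
    std::vector<float>        scores(n_vocab);

    const int n = llama_get_vocab(ctx, strings.data(), scores.data(), n_vocab);
    for (int i = 0; i < n; ++i) {
        printf("%6d  %s  %.3f\n", i, strings[i], scores[i]);
    }
}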