llama_cpp 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -16,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -49,17 +53,22 @@ enum e_model {
 MODEL_65B,
 };

-
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
 // needs modifications in ggml

+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+(void) tensor;
+}
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
 static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B,
+{ MODEL_3B, 256ull * MB },
 { MODEL_7B, 512ull * MB },
 { MODEL_13B, 512ull * MB },
 { MODEL_30B, 512ull * MB },
@@ -71,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
 static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B,
+{ MODEL_3B, 256ull * MB },
 { MODEL_7B, 512ull * MB },
 { MODEL_13B, 512ull * MB },
 { MODEL_30B, 512ull * MB },
@@ -156,6 +165,11 @@ struct llama_kv_cache {
 if (ctx) {
 ggml_free(ctx);
 }
+
+#ifdef GGML_USE_CUBLAS
+ggml_cuda_free_data(k);
+ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
 }
 };

@@ -170,6 +184,7 @@ struct llama_model {
 struct ggml_tensor * output;

 std::vector<llama_layer> layers;
+int n_gpu_layers;

 // context
 struct ggml_context * ctx = NULL;
@@ -195,6 +210,17 @@ struct llama_model {
 if (ctx) {
 ggml_free(ctx);
 }
+
+#ifdef GGML_USE_CUBLAS
+for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ggml_cuda_free_data(tensors_by_name[i].second);
+}
+ggml_cuda_free_scratch();
+#elif defined(GGML_USE_CLBLAST)
+for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ggml_cl_free_data(tensors_by_name[i].second);
+}
+#endif
 }
 };

@@ -243,6 +269,10 @@ struct llama_context {
 llama_ctx_buffer buf_compute;
 llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

+#ifdef GGML_USE_METAL
+ggml_metal_context * ctx_metal = NULL;
+#endif
+
 int buf_last = 0;
 size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -282,15 +312,15 @@ template <typename T>
 static T checked_mul(T a, T b) {
 T ret = a * b;
 if (a != 0 && ret / a != b) {
-throw format("overflow multiplying %llu * %llu",
-(unsigned long long) a, (unsigned long long) b);
+throw std::runtime_error(format("overflow multiplying %llu * %llu",
+(unsigned long long) a, (unsigned long long) b));
 }
 return ret;
 }

 static size_t checked_div(size_t a, size_t b) {
 if (b == 0 || a % b != 0) {
-throw format("error dividing %zu / %zu", a, b);
+throw std::runtime_error(format("error dividing %zu / %zu", a, b));
 }
 return a / b;
 }
@@ -354,7 +384,7 @@ struct llama_load_tensor {
 const auto & first_shard = shards.at(0);
 for (const auto & shard : shards) {
 if (shard.type != first_shard.type) {
-throw format("inconsistent tensor shard type in '%s'", name.c_str());
+throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
 }
 }
 type = first_shard.type;
@@ -377,8 +407,8 @@ struct llama_load_tensor {
 const auto & first_shard = shards.at(0);
 for (const auto & shard : shards) {
 if (shard.ne != first_shard.ne) {
-throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
+throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
 }
 }
 ne = first_shard.ne;
@@ -456,8 +486,8 @@ struct llama_file_loader {
 }
 }

-throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-magic, version);
+throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+magic, version));
 }
 void read_hparams() {
 hparams.n_vocab = file.read_u32();
@@ -497,7 +527,7 @@ struct llama_file_loader {
 file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
 std::string name = file.read_string(name_len);
 if (n_dims < 1 || n_dims > 2) {
-throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
 }
 switch (shard.type) {
 case GGML_TYPE_F32:
@@ -507,9 +537,14 @@
 case GGML_TYPE_Q5_0:
 case GGML_TYPE_Q5_1:
 case GGML_TYPE_Q8_0:
+case GGML_TYPE_Q2_K:
+case GGML_TYPE_Q3_K:
+case GGML_TYPE_Q4_K:
+case GGML_TYPE_Q5_K:
+case GGML_TYPE_Q6_K:
 break;
 default: {
-throw format("unrecognized tensor type %u\n", shard.type);
+throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
 }
 }

@@ -582,6 +617,11 @@ struct llama_file_saver {
 case GGML_TYPE_Q5_0:
 case GGML_TYPE_Q5_1:
 case GGML_TYPE_Q8_0:
+case GGML_TYPE_Q2_K:
+case GGML_TYPE_Q3_K:
+case GGML_TYPE_Q4_K:
+case GGML_TYPE_Q5_K:
+case GGML_TYPE_Q6_K:
 break;
 default: LLAMA_ASSERT(false);
 }
@@ -613,7 +653,7 @@ struct llama_model_loader {
 auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
 file_loaders.emplace_back(ith_file);
 if (ith_file->hparams != first_file->hparams) {
-throw format("llama.cpp: hparams inconsistent between files");
+throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
 }
 }
 if (!llama_mmap::SUPPORTED) {
@@ -643,7 +683,7 @@ struct llama_model_loader {
 uint32_t guess_n_parts() const {
 auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
 if (it == tensors_map.name_to_idx.end()) {
-throw std::string("missing tok_embeddings.weight");
+throw std::runtime_error(std::string("missing tok_embeddings.weight"));
 }
 const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
 return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -660,12 +700,12 @@ struct llama_model_loader {
 struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
 auto it = tensors_map.name_to_idx.find(name);
 if (it == tensors_map.name_to_idx.end()) {
-throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
+throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
 }
 llama_load_tensor & lt = tensors_map.tensors.at(it->second);
 if (lt.ne != ne) {
-throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
+throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
 }

 return get_tensor_for(lt, backend);
@@ -673,6 +713,9 @@ struct llama_model_loader {

 struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
 struct ggml_tensor * tensor;
+if (backend != GGML_BACKEND_CPU) {
+ggml_set_no_alloc(ggml_ctx, true);
+}
 if (lt.ne.size() == 2) {
 tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
 } else {
@@ -681,6 +724,10 @@ struct llama_model_loader {
 }
 ggml_set_name(tensor, lt.name.c_str());
 LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
+if (backend != GGML_BACKEND_CPU) {
+ggml_set_no_alloc(ggml_ctx, use_mmap);
+}
 tensor->backend = backend;
 lt.ggml_tensor = tensor;
 num_ggml_tensors_created++;
@@ -689,13 +736,14 @@ struct llama_model_loader {

 void done_getting_tensors() const {
 if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-throw std::string("llama.cpp: file contained more tensors than expected");
+throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
 }
 }

 void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
 size_t data_size = 0;
 size_t prefetch_size = 0;
+size_t lock_size = 0;
 for (const llama_load_tensor & lt : tensors_map.tensors) {
 data_size += lt.size;
 if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -705,11 +753,6 @@ struct llama_model_loader {

 if (use_mmap) {
 mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-if (!lmlock) {
-// Don't call the callback since the actual loading will be lazy
-// and we can't measure it.
-progress_callback = NULL;
-}
 if (lmlock) {
 lmlock->init(mapping->addr);
 }
@@ -717,20 +760,49 @@ struct llama_model_loader {

 size_t done_size = 0;
 for (llama_load_tensor & lt : tensors_map.tensors) {
-if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-continue;
-}
 if (progress_callback) {
 progress_callback((float) done_size / data_size, progress_callback_user_data);
 }
 LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
 lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+// allocate temp buffer if not using mmap
+if (!use_mmap && lt.data == NULL) {
+GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+}
+
 load_data_for(lt);
-
-
-
-
+
+switch(lt.ggml_tensor->backend) {
+case GGML_BACKEND_CPU:
+lt.ggml_tensor->data = lt.data;
+if (use_mmap && lmlock) {
+lock_size += lt.size;
+lmlock->grow_to(lock_size);
+}
+break;
+#if defined(GGML_USE_CUBLAS)
+case GGML_BACKEND_GPU:
+case GGML_BACKEND_GPU_SPLIT:
+ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+if (!use_mmap) {
+free(lt.data);
+}
+break;
+#elif defined(GGML_USE_CLBLAST)
+case GGML_BACKEND_GPU:
+ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+if (!use_mmap) {
+free(lt.data);
+}
+break;
+#endif
+default:
+continue;
 }
+
+done_size += lt.size;
 }
 }

@@ -801,7 +873,8 @@ static bool kv_cache_init(
 const struct llama_hparams & hparams,
 struct llama_kv_cache & cache,
 ggml_type wtype,
-int n_ctx
+int n_ctx,
+int n_gpu_layers) {
 const int n_embd = hparams.n_embd;
 const int n_layer = hparams.n_layer;

@@ -827,13 +900,26 @@ static bool kv_cache_init(
 ggml_set_name(cache.k, "cache_k");
 ggml_set_name(cache.v, "cache_v");

+#ifdef GGML_USE_CUBLAS
+if (n_gpu_layers > n_layer + 1) {
+ggml_cuda_assign_buffers_no_scratch(cache.v);
+}
+if (n_gpu_layers > n_layer + 2) {
+ggml_cuda_assign_buffers_no_scratch(cache.k);
+}
+#endif // GGML_USE_CUBLAS
+
 return true;
 }

 struct llama_context_params llama_context_default_params() {
 struct llama_context_params result = {
 /*.n_ctx =*/ 512,
+/*.n_batch =*/ 512,
 /*.gpu_layers =*/ 0,
+/*.main_gpu =*/ 0,
+/*.tensor_split =*/ {0},
+/*.low_vram =*/ false,
 /*.seed =*/ -1,
 /*.f16_kv =*/ true,
 /*.logits_all =*/ false,
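The hunk above adds n_batch, main_gpu, tensor_split, and low_vram to llama_context_params. A minimal C++ usage sketch (not part of the diff; the model path and values are placeholders) of how a caller might fill the new fields before creating a context:

    // sketch only: assumes llama.h from this release is on the include path
    #include "llama.h"

    llama_context * init_with_offload(const char * model_path) {
        llama_context_params params = llama_context_default_params();
        params.n_ctx        = 2048;
        params.n_batch      = 512;   // new: also used to size the VRAM scratch buffer (n_batch * 1 MB)
        params.n_gpu_layers = 32;    // layers to offload when built with cuBLAS/CLBlast
        params.main_gpu     = 0;     // new: passed to ggml_cuda_set_main_device
        params.low_vram     = false; // new: when true, no VRAM scratch buffer is allocated
        return llama_init_from_file(model_path, params);
    }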
@@ -848,6 +934,17 @@ struct llama_context_params llama_context_default_params() {
 return result;
 }

+struct llama_model_quantize_params llama_model_quantize_default_params() {
+struct llama_model_quantize_params result = {
+/*.nthread =*/ 0,
+/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+/*.allow_requantize =*/ false,
+/*.quantize_output_tensor =*/ true,
+};
+
+return result;
+}
+
 bool llama_mmap_supported() {
 return llama_mmap::SUPPORTED;
 }
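llama_model_quantize_default_params() above pairs with the params-based quantize path. A hedged sketch of requesting one of the new k-quant formats; it assumes the public llama_model_quantize wrapper forwards a llama_model_quantize_params pointer to the internal function shown further down in this diff, and the file names are placeholders:

    #include "llama.h"

    int quantize_q4_k_m(const char * fname_inp, const char * fname_out) {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // one of the new k-quant file types
        qparams.nthread = 4;                         // <= 0 falls back to hardware_concurrency()
        return llama_model_quantize(fname_inp, fname_out, &qparams); // assumed to return 0 on success
    }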
@@ -898,6 +995,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
 case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
 case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+// K-quants
+case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
 default: return "unknown, may not work";
 }
 }
@@ -917,7 +1024,11 @@ static void llama_model_load_internal(
 const std::string & fname,
 llama_context & lctx,
 int n_ctx,
+int n_batch,
 int n_gpu_layers,
+int main_gpu,
+const float * tensor_split,
+bool low_vram,
 ggml_type memory_type,
 bool use_mmap,
 bool use_mlock,
@@ -932,9 +1043,9 @@ static void llama_model_load_internal(
 lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
 auto & model = lctx.model;
 model.hparams = ml->file_loaders.at(0)->hparams;
+model.n_gpu_layers = n_gpu_layers;
 llama_file_version file_version = ml->file_loaders.at(0)->file_version;
 auto & hparams = model.hparams;
-uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

 {
 switch (hparams.n_layer) {
@@ -943,11 +1054,19 @@ static void llama_model_load_internal(
 case 40: model.type = e_model::MODEL_13B; break;
 case 60: model.type = e_model::MODEL_30B; break;
 case 80: model.type = e_model::MODEL_65B; break;
+default:
+{
+if (hparams.n_layer < 32) {
+model.type = e_model::MODEL_7B;
+}
+} break;
 }

 hparams.n_ctx = n_ctx;
 }

+const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+
 {
 fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
 fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
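As a worked example of the relocated n_ff formula: with the usual LLaMA-7B hyperparameters n_embd = 4096 and n_mult = 256, 2*(4*4096)/3 = 10922 under integer division, and rounding up to the next multiple of 256 gives n_ff = ((10922 + 255)/256)*256 = 11008, the feed-forward width used for the w1/w2/w3 tensors loaded later in this diff.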
@@ -967,7 +1086,7 @@ static void llama_model_load_internal(
 if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
 hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
 hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
 }
 }

@@ -975,7 +1094,7 @@ static void llama_model_load_internal(
 if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
 hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
 hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
-throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
 }
 }

@@ -1006,18 +1125,28 @@ static void llama_model_load_internal(

 model.ctx = ggml_init(params);
 if (!model.ctx) {
-throw format("ggml_init() failed");
+throw std::runtime_error(format("ggml_init() failed"));
 }
 }

-
-#
+(void) main_gpu;
+#if defined(GGML_USE_CUBLAS)
+fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+ggml_cuda_set_main_device(main_gpu);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+#elif defined(GGML_USE_CLBLAST)
+fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
-#define LLAMA_BACKEND_OFFLOAD
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif

 // prepare memory for the weights
-size_t
+size_t vram_weights = 0;
+size_t vram_scratch = 0;
 {
 const uint32_t n_embd = hparams.n_embd;
 const uint32_t n_layer = hparams.n_layer;
@@ -1026,25 +1155,42 @@ static void llama_model_load_internal(
 ml->ggml_ctx = ctx;

 model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);

 // "output" tensor
 {
+ggml_backend backend_norm;
 ggml_backend backend_output;
 if (n_gpu_layers > int(n_layer)) { // NOLINT
-
+// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+// on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
 } else {
+backend_norm = GGML_BACKEND_CPU;
 backend_output = GGML_BACKEND_CPU;
 }

+model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
 model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+if (backend_norm == GGML_BACKEND_GPU) {
+vram_weights += ggml_nbytes(model.norm);
+}
+if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+vram_weights += ggml_nbytes(model.output);
+}
 }

 const int i_gpu_start = n_layer - n_gpu_layers;

 model.layers.resize(n_layer);
 for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

 auto & layer = model.layers[i];

@@ -1052,21 +1198,21 @@ static void llama_model_load_internal(

 layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

-layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd},
-layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd},
-layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd},
-layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd},
+layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
+layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split);
+layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split);
+layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);

 layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff},
-layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd},
-layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff},
+layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split);
+layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split);
+layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split);

-if (backend ==
-
+if (backend == GGML_BACKEND_GPU) {
+vram_weights +=
 ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.
+ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
 ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
 }
 }
@@ -1081,10 +1227,10 @@ static void llama_model_load_internal(
 // this is the total memory required to run the inference
 const size_t mem_required =
 ctx_size +
-mmapped_size -
+mmapped_size - vram_weights + // weights in VRAM not in memory
 MEM_REQ_SCRATCH0().at(model.type) +
 MEM_REQ_SCRATCH1().at(model.type) +
-MEM_REQ_EVAL().at(model.type);
+MEM_REQ_EVAL().at (model.type);

 // this is the memory required by one llama_state
 const size_t mem_required_state =
@@ -1093,15 +1239,51 @@ static void llama_model_load_internal(
 fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

+(void) vram_scratch;
+(void) n_batch;
 #ifdef GGML_USE_CUBLAS
+if (low_vram) {
+fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+ggml_cuda_set_scratch_size(0); // disable scratch
+} else {
+vram_scratch = n_batch * MB;
+ggml_cuda_set_scratch_size(vram_scratch);
+if (n_gpu_layers > 0) {
+fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+__func__, vram_scratch / MB);
+}
+}
+#endif // GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-fprintf(stderr, "%s:
+fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
 if (n_gpu_layers > (int) hparams.n_layer) {
-fprintf(stderr, "%s:
+fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
+}
+size_t vram_kv_cache = 0;
+if (n_gpu_layers > (int) hparams.n_layer + 1) {
+if (low_vram) {
+fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+} else {
+fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+}
 }
-
-
+if (n_gpu_layers > (int) hparams.n_layer + 2) {
+if (low_vram) {
+fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+} else {
+fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+}
+}
+const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+__func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+__func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
+#else
 (void) n_gpu_layers;
 #endif
 }
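To make the offload thresholds above concrete: for a 7B model (n_layer = 32, assuming the standard hyperparameters), n_gpu_layers = 33 additionally offloads the non-repeating tensors, 34 also offloads the V cache, and 35 (= n_layer + 3) offloads the K cache as well, so the log would report 35/35 layers offloaded; with low_vram enabled, max_offloadable_layers drops to n_layer + 1 and the KV cache always stays in system memory.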
@@ -1111,57 +1293,15 @@ static void llama_model_load_internal(
 model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
 }

-
-
-#ifdef GGML_USE_CUBLAS
+(void) tensor_split;
+#if defined(GGML_USE_CUBLAS)
 {
-
-size_t data_size = 0;
-for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-data_size += lt.size;
-if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-done_size += lt.size;
-}
-}
-for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-continue;
-}
-if (progress_callback) {
-progress_callback((float) done_size / data_size, progress_callback_user_data);
-}
-ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-done_size += lt.size;
-}
-}
-#elif defined(GGML_USE_CLBLAST)
-{
-const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
-
-size_t vram_total = 0;
-
-for (int i = 0; i < n_gpu; ++i) {
-const auto & layer = model.layers[i];
-
-ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-}
-if (n_gpu_layers > (int) hparams.n_layer) {
-fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
-ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
-}
-
-fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+ggml_cuda_set_tensor_split(tensor_split);
 }
 #endif

+ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
 if (progress_callback) {
 progress_callback(1.0f, progress_callback_user_data);
 }
@@ -1177,7 +1317,11 @@ static bool llama_model_load(
 const std::string & fname,
 llama_context & lctx,
 int n_ctx,
+int n_batch,
 int n_gpu_layers,
+int main_gpu,
+float * tensor_split,
+bool low_vram,
 ggml_type memory_type,
 bool use_mmap,
 bool use_mlock,
@@ -1185,28 +1329,30 @@ static bool llama_model_load(
 llama_progress_callback progress_callback,
 void *progress_callback_user_data) {
 try {
-llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers,
-vocab_only, progress_callback, progress_callback_user_data);
+llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
 return true;
-} catch (const std::
-fprintf(stderr, "error loading model: %s\n", err.
+} catch (const std::exception & err) {
+fprintf(stderr, "error loading model: %s\n", err.what());
 return false;
 }
 }

 // evaluate the transformer
 //
-// - lctx:
-// - tokens:
-// - n_past:
-// - n_threads:
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+// - cgraph_fname: filename of the exported computation graph
 //
 static bool llama_eval_internal(
-llama_context &
-const llama_token *
-const int
-const int
-const int
+llama_context & lctx,
+const llama_token * tokens,
+const int n_tokens,
+const int n_past,
+const int n_threads,
+const char * cgraph_fname) {

 // enforce that the first token is BOS
 if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1225,12 +1371,13 @@ static bool llama_eval_internal(

 LLAMA_ASSERT(!!kv_self.ctx);

-const int n_embd
-const int n_layer
-const int n_ctx
-const int n_head
-const int n_vocab
-const int n_rot
+const int n_embd = hparams.n_embd;
+const int n_layer = hparams.n_layer;
+const int n_ctx = hparams.n_ctx;
+const int n_head = hparams.n_head;
+const int n_vocab = hparams.n_vocab;
+const int n_rot = hparams.n_embd/hparams.n_head;
+const int n_gpu_layers = model.n_gpu_layers;

 auto & mem_per_token = lctx.mem_per_token;
 auto & buf_compute = lctx.buf_compute;
@@ -1252,40 +1399,98 @@
 ggml_set_name(embd, "embd");
 memcpy(embd->data, tokens, N*ggml_element_size(embd));

+struct ggml_tensor * cur;
 struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

+const int i_gpu_start = n_layer - n_gpu_layers;
+(void) i_gpu_start;
+
+// offload functions set the tensor output backend to GPU
+// tensors are GPU-accelerated if any input or the output has been offloaded
+//
+// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+// in that case ggml_cuda_assign_buffers has no effect
+offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+offload_func_t offload_func_kq = llama_nop;
+offload_func_t offload_func_v = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+if (n_gpu_layers > n_layer) {
+offload_func_nr = ggml_cuda_assign_buffers;
+}
+if (n_gpu_layers > n_layer + 1) {
+offload_func_v = ggml_cuda_assign_buffers;
+}
+if (n_gpu_layers > n_layer + 2) {
+offload_func_kq = ggml_cuda_assign_buffers;
+}
+#endif // GGML_USE_CUBLAS
+
 for (int il = 0; il < n_layer; ++il) {
-
+offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+if (il >= i_gpu_start) {
+offload_func = ggml_cuda_assign_buffers;
+}
+#endif // GGML_USE_CUBLAS

-struct ggml_tensor *
+struct ggml_tensor * inpSA = inpL;

 lctx.use_buf(ctx0, 0);

 // norm
 {
 cur = ggml_rms_norm(ctx0, inpL);
+offload_func(cur);
+ggml_set_name(cur, "rms_norm_0");

 // cur = cur*attention_norm(broadcasted)
 cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+offload_func(cur);
+ggml_set_name(cur, "attention_norm_0");
 }

 // self-attention
 {
 // compute Q and K and RoPE them
-struct ggml_tensor *
-
-ggml_set_name(
+struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+offload_func_kq(tmpk);
+ggml_set_name(tmpk, "tmpk");
+
+struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+offload_func_kq(tmpq);
+ggml_set_name(tmpq, "tmpq");
+
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+offload_func_kq(Kcur);
 ggml_set_name(Kcur, "Kcur");

+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+offload_func_kq(Qcur);
+ggml_set_name(Qcur, "Qcur");
+
 // store key and value to memory
 {
 // compute the transposed [N, n_embd] V matrix
-
+
+struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+offload_func_v(tmpv);
+ggml_set_name(tmpv, "tmpv");
+
+struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+offload_func_v(Vcur);
+ggml_set_name(Vcur, "Vcur");

 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+offload_func_kq(k);
+ggml_set_name(k, "k");
+
 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
 ( n_ctx)*ggml_element_size(kv_self.v),
 (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+offload_func_v(v);
+ggml_set_name(v, "v");

 // important: storing RoPE-ed version of K in the KV cache!
 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1296,6 +1501,7 @@
 ggml_permute(ctx0,
 Qcur,
 0, 2, 1, 3);
+offload_func_kq(Q);
 ggml_set_name(Q, "Q");

 struct ggml_tensor * K =
@@ -1304,10 +1510,12 @@
 ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
 n_embd/n_head, n_head, n_past + N),
 0, 2, 1, 3);
+offload_func_kq(K);
 ggml_set_name(K, "K");

 // K * Q
 struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+offload_func_kq(KQ);
 ggml_set_name(KQ, "KQ");

 // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1316,17 +1524,19 @@

 // KQ_scaled shape [n_past + N, N, n_head, 1]
 struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+offload_func_kq(KQ_scaled);
 ggml_set_name(KQ_scaled, "KQ_scaled");

 // KQ_masked = mask_past(KQ_scaled)
 struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+offload_func_kq(KQ_masked);
 ggml_set_name(KQ_masked, "KQ_masked");

 // KQ = soft_max(KQ_masked)
 struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+offload_func_v(KQ_soft_max);
 ggml_set_name(KQ_soft_max, "KQ_soft_max");

-
 // split cached V into n_head heads
 struct ggml_tensor * V =
 ggml_view_3d(ctx0, kv_self.v,
@@ -1334,10 +1544,12 @@
 n_ctx*ggml_element_size(kv_self.v),
 n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
 il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+offload_func_v(V);
 ggml_set_name(V, "V");

 #if 1
 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+offload_func_v(KQV);
 ggml_set_name(KQV, "KQV");
 #else
 // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1349,56 +1561,79 @@

 // KQV_merged = KQV.permute(0, 2, 1, 3)
 struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+offload_func_v(KQV_merged);
 ggml_set_name(KQV_merged, "KQV_merged");

 // cur = KQV_merged.contiguous().view(n_embd, N)
 cur = ggml_cpy(ctx0,
 KQV_merged,
 ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+offload_func_v(cur);
 ggml_set_name(cur, "KQV_merged_contiguous");

 // projection (no bias)
 cur = ggml_mul_mat(ctx0,
 model.layers[il].wo,
 cur);
+offload_func(cur);
+ggml_set_name(cur, "result_wo");
 }

 lctx.use_buf(ctx0, 1);

 struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+offload_func(inpFF);
+ggml_set_name(inpFF, "inpFF");

 // feed-forward network
 {
 // norm
 {
 cur = ggml_rms_norm(ctx0, inpFF);
+offload_func(cur);
+ggml_set_name(cur, "rms_norm_1");

 // cur = cur*ffn_norm(broadcasted)
 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+offload_func(cur);
+ggml_set_name(cur, "ffn_norm");
 }

 struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
 model.layers[il].w3,
 cur);
+offload_func(tmp);
+ggml_set_name(tmp, "result_w3");

 cur = ggml_mul_mat(ctx0,
 model.layers[il].w1,
 cur);
+offload_func(cur);
+ggml_set_name(cur, "result_w2");

 // SILU activation
 cur = ggml_silu(ctx0, cur);
+offload_func(cur);
+ggml_set_name(cur, "silu");

 cur = ggml_mul(ctx0, cur, tmp);
+offload_func(cur);
+ggml_set_name(cur, "silu_x_result_w3");

 cur = ggml_mul_mat(ctx0,
 model.layers[il].w2,
 cur);
+offload_func(cur);
+ggml_set_name(cur, "result_w2");
 }

 cur = ggml_add(ctx0, cur, inpFF);
+offload_func(cur);
+ggml_set_name(cur, "inpFF_+_result_w2");

 // input for next layer
 inpL = cur;
+
 }

 lctx.use_buf(ctx0, 0);
@@ -1406,28 +1641,68 @@
 // used at the end to optionally extract the embeddings
 struct ggml_tensor * embeddings = NULL;

+
 // norm
 {
+cur = ggml_rms_norm(ctx0, inpL);
+offload_func_nr(cur);
+ggml_set_name(cur, "rms_norm_inpL");

-
+cur = ggml_rms_norm(ctx0, cur);
+offload_func_nr(cur);
+ggml_set_name(cur, "rms_norm_after");

-//
-
+// cur = cur*norm(broadcasted)
+cur = ggml_mul(ctx0, cur, model.norm);
+offload_func_nr(cur);
+ggml_set_name(cur, "result_norm");

-embeddings =
+embeddings = cur;
 }

+
 // lm_head
-
+cur = ggml_mul_mat(ctx0, model.output, cur);
+ggml_set_name(cur, "result_output");

 lctx.use_buf(ctx0, -1);

 // logits -> probs
-//
+//cur = ggml_soft_max_inplace(ctx0, cur);

 // run the computation
-ggml_build_forward_expand(&gf,
-
+ggml_build_forward_expand(&gf, cur);
+
+#ifdef GGML_USE_METAL
+if (lctx.ctx_metal && N == 1) {
+ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ggml_metal_get_tensor (lctx.ctx_metal, cur);
+} else {
+// IMPORTANT:
+// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
+// ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
+// coprocessor.
+//
+// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+// But for now, we have focused only on Matrix x Vector Metal multiplication.
+//
+// TODO: avoid these syncs via shared memory (ref #1696)
+//
+if (lctx.ctx_metal) {
+// We need to sync the GPU KV cache with the CPU KV cache
+ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
+ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
+}
+
+ggml_graph_compute(ctx0, &gf);
+}
+#else
+ggml_graph_compute(ctx0, &gf);
+#endif
+
+if (cgraph_fname) {
+ggml_graph_export(&gf, cgraph_fname);
+}

 #ifdef GGML_PERF
 // print timing information per ggml operation (for debugging purposes)
@@ -1441,7 +1716,7 @@
 //}

 //embd_w.resize(n_vocab*N);
-//memcpy(embd_w.data(), ggml_get_data(
+//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

 // update kv token count
 lctx.model.kv_self.n = n_past + N;
@@ -1452,11 +1727,11 @@

 if (lctx.logits_all) {
 logits_out.resize(n_vocab * N);
-memcpy(logits_out.data(), (float *) ggml_get_data(
+memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
 } else {
 // return result for just the last token
 logits_out.resize(n_vocab);
-memcpy(logits_out.data(), (float *) ggml_get_data(
+memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
 }
 }

@@ -1987,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
 return -log2f(candidate.p) > *mu;
 }));

+if (candidates->size == 0) {
+candidates->size = 1;
+}
+
 // Normalize the probabilities of the remaining words
 llama_sample_softmax(ctx, candidates);

@@ -2055,16 +2334,92 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //

-static void
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+if (output.size < nelements * sizeof(float)) {
+output.resize(nelements * sizeof(float));
+}
+float * f32_output = (float *) output.addr;
+
+quantize_fns_t qtype;
+if (ggml_is_quantized(tensor.type)) {
+qtype = ggml_internal_get_quantize_fn(tensor.type);
+if (qtype.dequantize_row_q == NULL) {
+throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+}
+} else if (tensor.type != GGML_TYPE_F16) {
+throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+}
+
+if (nthread < 2) {
+if (tensor.type == GGML_TYPE_F16) {
+ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+} else if (ggml_is_quantized(tensor.type)) {
+qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+} else {
+LLAMA_ASSERT(false); // unreachable
+}
+return;
+}
+
+auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+auto block_size_bytes = ggml_type_size(tensor.type);
+
+LLAMA_ASSERT(nelements % block_size == 0);
+auto nblocks = nelements / block_size;
+auto blocks_per_thread = nblocks / nthread;
+auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+std::vector<std::thread> workers;
+for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+if (typ == GGML_TYPE_F16) {
+ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+} else {
+qtype.dequantize_row_q(inbuf, outbuf, nels);
+}
+};
+workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+in_buff_offs += thr_block_bytes;
+out_buff_offs += thr_elems;
+}
+for (auto & worker : workers) {
+worker.join();
+}
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
 ggml_type quantized_type;
-
+llama_ftype ftype = params->ftype;
+int nthread = params->nthread;
+
+switch (params->ftype) {
 case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
 case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
 case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
 case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
 case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-
-
+case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
+
+#ifdef GGML_USE_K_QUANTS
+// K-quants
+case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+#endif
+default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+}

 if (nthread <= 0) {
 nthread = std::thread::hardware_concurrency();
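A quick check of the thread partitioning in llama_convert_tensor_internal above: if a tensor dequantizes to nblocks = 1000 blocks and nthread = 3, then blocks_per_thread = 333 and spare_blocks = 1, so the first two workers convert 333 blocks each and the last worker takes 334; with nthread = 8 the split is an even 125 blocks per worker with no spare.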
@@ -2072,7 +2427,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
 /*vocab_only*/ false));
-llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+
+#ifdef GGML_USE_K_QUANTS
+int n_attention_wv = 0;
+int n_feed_forward_w2 = 0;
+for (auto& tensor : model_loader->tensors_map.tensors) {
+if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+++n_attention_wv;
+}
+else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+++n_feed_forward_w2;
+}
+}
+
+int i_attention_wv = 0;
+int i_feed_forward_w2 = 0;
+#endif

 size_t total_size_org = 0;
 size_t total_size_new = 0;
@@ -2098,11 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 // quantize only 2D tensors
 quantize &= (tensor.ne.size() == 2);
-
-
-//if (tensor.name == "output.weight") {
-// quantize = false;
-//}
+quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+quantize &= quantized_type != tensor.type;

 enum ggml_type new_type;
 void * new_data;
@@ -2116,20 +2484,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
 } else {
 new_type = quantized_type;
+#ifdef GGML_USE_K_QUANTS
+if (tensor.name == "output.weight") {
+new_type = GGML_TYPE_Q6_K;
+} else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+(i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+++i_attention_wv;
+} else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+(i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+++i_feed_forward_w2;
+} else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+}
+#endif
+
 float * f32_data;
 size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
 llama_buffer f32_conv_buf;
+
 if (tensor.type == GGML_TYPE_F32) {
 f32_data = (float *) tensor.data;
-} else if (tensor.type
-
-f32_data = (float *) f32_conv_buf.addr;
-const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-for (size_t i = 0; i < nelements; i++) {
-f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-}
+} else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
 } else {
-
+llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+f32_data = (float *) f32_conv_buf.addr;
 }

 printf("quantizing .. ");
@@ -2183,12 +2571,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
+                tot_count += hist_cur[i];
             }
 
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                printf("%5.3f ", hist_cur[i] / float(nelements));
+            if (tot_count > 0) {
+                for (size_t i = 0; i < hist_cur.size(); i++) {
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                }
             }
             printf("\n");
         }
@@ -2206,11 +2598,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             sum_all += hist_all[i];
         }
 
-        printf("%s: hist: ", __func__);
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            printf("%5.3f ", hist_all[i] / float(sum_all));
+        if (sum_all > 0) {
+            printf("%s: hist: ", __func__);
+            for (size_t i = 0; i < hist_all.size(); i++) {
+                printf("%5.3f ", hist_all[i] / float(sum_all));
+            }
+            printf("\n");
         }
-        printf("\n");
     }
 }
 
@@ -2251,9 +2645,9 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers,
-
-
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
         return nullptr;
@@ -2261,7 +2655,7 @@ struct llama_context * llama_init_from_file(
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -2291,6 +2685,38 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void *data_ptr = NULL;
+        size_t data_size = 0;
+
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result) \
+        if (!(result)) { \
+            fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+            llama_free(ctx); \
+            return NULL; \
+        }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
     return ctx;
 }
 
@@ -2301,13 +2727,12 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype,
-        int nthread) {
+        const llama_model_quantize_params *params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+        llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2560,8 +2985,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -2906,7 +3331,7 @@ int llama_eval(
         int n_tokens,
         int n_past,
         int n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -2921,6 +3346,20 @@ int llama_eval(
     return 0;
 }
 
+int llama_eval_export(struct llama_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx = 512 - n_batch;
+
+    const std::vector<llama_token> tmp(n_batch, llama_token_bos());
+
+    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,
@@ -2953,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i] = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }