llama_cpp 0.1.1 → 0.1.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +8 -2
- data/ext/llama_cpp/llama_cpp.cpp +60 -6
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +1034 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +398 -184
- data/ext/llama_cpp/src/ggml.h +14 -3
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +191 -92
- data/ext/llama_cpp/src/llama.h +30 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +1 -0
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -361
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -11,6 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #include <array>
@@ -45,6 +48,7 @@ enum e_model {
     MODEL_65B,
 };
 
+
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -110,7 +114,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };
 
@@ -406,6 +410,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };
 
 struct llama_file_loader {
@@ -424,24 +429,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;
 
-        if (magic
-
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
         }
 
-
-
-
-
-
-
-
-
-
-
-
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
         }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -499,7 +510,7 @@ struct llama_file_loader {
 
             if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
                 // skip to the next multiple of 32 bytes
-                file.seek(-file.tell() & 31, SEEK_CUR);
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
             }
             shard.file_idx = file_idx;
             shard.file_off = file.tell();
@@ -574,7 +585,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -641,7 +652,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +663,10 @@ struct llama_model_loader {
                 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
 
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +676,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -678,12 +690,16 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -696,6 +712,9 @@ struct llama_model_loader {
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -708,9 +727,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }
 
     void load_data_for(llama_load_tensor & lt) {
@@ -835,6 +851,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
@@ -844,7 +875,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";
@@ -924,11 +956,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
-    if (file_version
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }
 
@@ -941,27 +981,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -983,7 +1003,14 @@ static void llama_model_load_internal(
         }
     }
 
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -991,33 +1018,87 @@ static void llama_model_load_internal(
 
         ml->ggml_ctx = ctx;
 
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
 
-
-
-
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#elif !defined(GGML_USE_CLBLAST)
+        (void) n_gpu_layers;
+#endif
+    }
+
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
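Note (illustration added here, not part of the gem diff): the hunk above assigns each transformer layer a backend with i_gpu_start = n_layer - n_gpu_layers, so the last n_gpu_layers layers are marked LLAMA_BACKEND_OFFLOAD (GGML_BACKEND_CUDA when built with cuBLAS) and the rest stay on the CPU; output.weight is offloaded only when n_gpu_layers exceeds n_layer. A minimal standalone C++ sketch of that arithmetic, with purely illustrative values:

    #include <cstdio>

    int main() {
        const int n_layer      = 32; // illustrative values, not taken from the diff
        const int n_gpu_layers = 20;
        const int i_gpu_start  = n_layer - n_gpu_layers; // 12
        for (int i = 0; i < n_layer; ++i) {
            // layers 0..11 stay on the CPU, layers 12..31 are offloaded
            std::printf("layer %2d -> %s\n", i, i < i_gpu_start ? "CPU" : "offload");
        }
        // output.weight would be offloaded only if n_gpu_layers > n_layer (33 or more here)
        return 0;
    }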
@@ -1025,37 +1106,61 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
+        }
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
+        }
+    }
+#elif defined(GGML_USE_CLBLAST)
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
 
         size_t vram_total = 0;
 
        for (int i = 0; i < n_gpu; ++i) {
            const auto & layer = model.layers[i];
 
-
-
-
-
-
-
-
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
         }
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "
-
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
         }
 
-        fprintf(stderr, "
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
     }
-#else
-    (void) n_gpu_layers;
 #endif
 
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml->mapping);
+
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
@@ -1153,10 +1258,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);
 
-            // cur = attention_norm
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
 
         // self-attention
@@ -1263,10 +1366,8 @@ static bool llama_eval_internal(
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
 
-                // cur = ffn_norm
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                        cur);
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1303,10 +1404,8 @@ static bool llama_eval_internal(
 
         inpL = ggml_rms_norm(ctx0, inpL);
 
-        // inpL = norm
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);
 
         embeddings = inpL;
     }
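Note (illustration, not part of the gem diff): the three hunks above can drop the explicit ggml_repeat because the bundled ggml.c in this release lets ggml_mul broadcast a 1-dimensional weight across the rows of its other operand. A sketch of the call shape before and after, reusing the names from the hunks:

    // before: materialize the norm weight at cur's shape, then multiply
    // cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].attention_norm, cur), cur);

    // after: ggml_mul broadcasts the 1d weight itself
    // cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);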
@@ -2130,7 +2229,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
@@ -2223,7 +2322,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic !=
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
            fprintf(stderr, "%s: bad file magic\n", __func__);
            return 1;
        }
@@ -2287,7 +2386,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
        // maybe this should in llama_model_loader
        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
        }
     }
 
@@ -2380,7 +2479,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
         size_t idx = model_loader->tensors_map.name_to_idx[base_name];
         llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
         lt.data = (uint8_t *) lt.ggml_tensor->data;
         model_loader->load_data_for(lt);
         lt.ggml_tensor->data = lt.data;
@@ -2606,8 +2705,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx,
-
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;
 
     // set rng
     {
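Note (usage sketch, not part of the gem diff): llama.cpp now exposes llama_init_backend() and llama_time_us(), both added above. A minimal C++ caller might look like the following; the model path is a placeholder and error handling is reduced to the essentials:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_init_backend(); // one-time ggml init (timers, f16 tables)

        const int64_t t_start = llama_time_us();

        llama_context_params params = llama_context_default_params();
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            std::fprintf(stderr, "failed to load model\n");
            return 1;
        }

        std::fprintf(stderr, "load took %.2f ms\n", (llama_time_us() - t_start) / 1000.0);
        llama_free(ctx);
        return 0;
    }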
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,10 +19,16 @@
 # define LLAMA_API
 #endif
 
-#define
-#define
-#define
-#define
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION           3
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
 #ifdef __cplusplus
@@ -40,9 +46,9 @@ extern "C" {
     typedef int llama_token;
 
     typedef struct llama_token_data {
-        llama_token id;
-        float logit;
-        float p;
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
     } llama_token_data;
 
     typedef struct llama_token_data_array {
@@ -73,16 +79,16 @@ extern "C" {
 
     // model file types
     enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32
-        LLAMA_FTYPE_MOSTLY_F16
-        LLAMA_FTYPE_MOSTLY_Q4_0
-        LLAMA_FTYPE_MOSTLY_Q4_1
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2
-        // LLAMA_FTYPE_MOSTLY_Q4_3
-        LLAMA_FTYPE_MOSTLY_Q8_0
-        LLAMA_FTYPE_MOSTLY_Q5_0
-        LLAMA_FTYPE_MOSTLY_Q5_1
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -90,6 +96,13 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
 
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -138,7 +151,7 @@ extern "C" {
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx,
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
 
     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
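Note (illustration, not part of the gem diff): the header now names the file magics and bumps LLAMA_FILE_VERSION to 3, matching the ggjt v3 quantization change. A small sketch of checking a model header against those constants, mirroring the read_magic() logic earlier in the diff; the path is a placeholder and older ggml/ggmf files are simply reported as not current:

    #include "llama.h"
    #include <cstdint>
    #include <cstdio>

    static bool is_current_ggjt(const char * path) {
        std::FILE * f = std::fopen(path, "rb");
        if (!f) {
            return false;
        }
        std::uint32_t magic = 0, version = 0;
        const bool ok = std::fread(&magic, sizeof(magic), 1, f) == 1 &&
                        std::fread(&version, sizeof(version), 1, f) == 1;
        std::fclose(f);
        // LLAMA_FILE_MAGIC is 'ggjt' and LLAMA_FILE_VERSION is 3 in this release
        return ok && magic == LLAMA_FILE_MAGIC && version == LLAMA_FILE_VERSION;
    }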
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.1'
+  VERSION = '0.1.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-66874d4'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -14,6 +14,7 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
 
+  def self?.init_backend: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-
+date: 2023-05-26 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,7 +27,7 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
|