llama_cpp 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +8 -2
- data/ext/llama_cpp/llama_cpp.cpp +60 -6
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +1034 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +398 -184
- data/ext/llama_cpp/src/ggml.h +14 -3
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +191 -92
- data/ext/llama_cpp/src/llama.h +30 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +1 -0
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -361
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -11,6 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif

 #include <array>
@@ -45,6 +48,7 @@ enum e_model {
     MODEL_65B,
 };

+
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -110,7 +114,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };

@@ -406,6 +410,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -424,24 +429,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;

-        if (magic …
-        …
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
         }

-        …
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
         }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
@@ -499,7 +510,7 @@ struct llama_file_loader {

         if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
             // skip to the next multiple of 32 bytes
-            file.seek(-file.tell() & 31, SEEK_CUR);
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         }
         shard.file_idx = file_idx;
         shard.file_off = file.tell();
@@ -574,7 +585,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -641,7 +652,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +663,10 @@ struct llama_model_loader {
                 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }

-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +676,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -678,12 +690,16 @@ struct llama_model_loader {

     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -696,6 +712,9 @@ struct llama_model_loader {

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -708,9 +727,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }

     void load_data_for(llama_load_tensor & lt) {
@@ -835,6 +851,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
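The two functions added above back the new public entry points declared in llama.h (see the header diff further down). For reference only, a minimal sketch of the intended call order from a host program follows; llama_context_default_params(), llama_init_from_file(), and llama_free() are the existing API from the same header, and the model path is a placeholder:

// sketch only, not part of the packaged diff
#include <cstdint>
#include <cstdio>
#include "llama.h"

int main() {
    llama_init_backend();                    // call once, before any model is loaded

    const int64_t t_start = llama_time_us(); // new timing helper added in this release

    llama_context_params params = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("path/to/model.bin", params); // placeholder path
    if (ctx == NULL) {
        return 1;
    }

    fprintf(stderr, "init: %.2f ms\n", (llama_time_us() - t_start) / 1000.0);

    llama_free(ctx);
    return 0;
}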
@@ -844,7 +875,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (…
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }

     return "unknown";
@@ -924,11 +956,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

-    if (file_version …
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/…
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
         }
     }

@@ -941,27 +981,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %…
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -983,7 +1003,14 @@ static void llama_model_load_internal(
         }
     }

+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -991,33 +1018,87 @@ static void llama_model_load_internal(

         ml->ggml_ctx = ctx;

-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];

             std::string layers_i = "layers." + std::to_string(i);

-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

-            layer.…
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

-            layer.…
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

-            …
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }

     ml->done_getting_tensors();

+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#elif !defined(GGML_USE_CLBLAST)
+        (void) n_gpu_layers;
+#endif
+    }
+
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
@@ -1025,37 +1106,61 @@ static void llama_model_load_internal(

     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
+        }
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
+        }
+    }
+#elif defined(GGML_USE_CLBLAST)
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-        fprintf(stderr, "…
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);

         size_t vram_total = 0;

         for (int i = 0; i < n_gpu; ++i) {
             const auto & layer = model.layers[i];

-            …
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
         }
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "…
-            …
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
         }

-        fprintf(stderr, "…
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
     }
-#else
-    (void) n_gpu_layers;
 #endif

+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml->mapping);
+
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
@@ -1153,10 +1258,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);

-            // cur = attention_norm
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                    cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }

         // self-attention
@@ -1263,10 +1366,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpFF);

-            // cur = ffn_norm
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                    cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }

         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1303,10 +1404,8 @@ static bool llama_eval_internal(

         inpL = ggml_rms_norm(ctx0, inpL);

-        // inpL = norm
-        inpL = ggml_mul(ctx0,
-                ggml_repeat(ctx0, model.norm, inpL),
-                inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);

         embeddings = inpL;
     }
@@ -2130,7 +2229,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                …
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
@@ -2223,7 +2322,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != …
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2287,7 +2386,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ …
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }

@@ -2380,7 +2479,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
         size_t idx = model_loader->tensors_map.name_to_idx[base_name];
         llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
         lt.data = (uint8_t *) lt.ggml_tensor->data;
         model_loader->load_data_for(lt);
         lt.ggml_tensor->data = lt.data;
@@ -2606,8 +2705,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, …
-…
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;

     // set rng
     {
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,10 +19,16 @@
 # define LLAMA_API
 #endif

-#define …
-#define …
-#define …
-#define …
+#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION 3
+#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

 #ifdef __cplusplus
@@ -40,9 +46,9 @@ extern "C" {
|
|
40
46
|
typedef int llama_token;
|
41
47
|
|
42
48
|
typedef struct llama_token_data {
|
43
|
-
llama_token id;
|
44
|
-
float logit;
|
45
|
-
float p;
|
49
|
+
llama_token id; // token id
|
50
|
+
float logit; // log-odds of the token
|
51
|
+
float p; // probability of the token
|
46
52
|
} llama_token_data;
|
47
53
|
|
48
54
|
typedef struct llama_token_data_array {
|
@@ -73,16 +79,16 @@ extern "C" {
|
|
73
79
|
|
74
80
|
// model file types
|
75
81
|
enum llama_ftype {
|
76
|
-
LLAMA_FTYPE_ALL_F32
|
77
|
-
LLAMA_FTYPE_MOSTLY_F16
|
78
|
-
LLAMA_FTYPE_MOSTLY_Q4_0
|
79
|
-
LLAMA_FTYPE_MOSTLY_Q4_1
|
82
|
+
LLAMA_FTYPE_ALL_F32 = 0,
|
83
|
+
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
84
|
+
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
85
|
+
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
80
86
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
81
|
-
// LLAMA_FTYPE_MOSTLY_Q4_2
|
82
|
-
// LLAMA_FTYPE_MOSTLY_Q4_3
|
83
|
-
LLAMA_FTYPE_MOSTLY_Q8_0
|
84
|
-
LLAMA_FTYPE_MOSTLY_Q5_0
|
85
|
-
LLAMA_FTYPE_MOSTLY_Q5_1
|
87
|
+
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
88
|
+
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
89
|
+
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
90
|
+
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
91
|
+
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
86
92
|
};
|
87
93
|
|
88
94
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
@@ -90,6 +96,13 @@ extern "C" {
|
|
90
96
|
LLAMA_API bool llama_mmap_supported();
|
91
97
|
LLAMA_API bool llama_mlock_supported();
|
92
98
|
|
99
|
+
// TODO: not great API - very likely to change
|
100
|
+
// Initialize the llama + ggml backend
|
101
|
+
// Call once at the start of the program
|
102
|
+
LLAMA_API void llama_init_backend();
|
103
|
+
|
104
|
+
LLAMA_API int64_t llama_time_us();
|
105
|
+
|
93
106
|
// Various functions for loading a ggml llama model.
|
94
107
|
// Allocate (almost) all memory needed for the model.
|
95
108
|
// Return NULL on failure
|
@@ -138,7 +151,7 @@ extern "C" {
|
|
138
151
|
|
139
152
|
// Set the state reading from the specified address
|
140
153
|
// Returns the number of bytes read
|
141
|
-
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx,
|
154
|
+
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
|
142
155
|
|
143
156
|
// Save/load session file
|
144
157
|
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
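The change above drops the const qualifier from the source pointer of llama_set_state_data(). For reference only, a minimal sketch of the copy/restore round trip, assuming llama_get_state_size() from the same header (it does not appear in this diff):

// sketch only, not part of the packaged diff
#include <cstdint>
#include <vector>
#include "llama.h"

// snapshot the full context state (rng, kv cache, logits, embeddings)
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx)); // assumed API; returns an upper bound on the state size
    buf.resize(llama_copy_state_data(ctx, buf.data()));  // shrink to the bytes actually written
    return buf;
}

// restore a previously saved snapshot; src is a plain uint8_t * per the new signature
static void load_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}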
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.1'
+  VERSION = '0.1.3'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-…'
+  LLAMA_CPP_VERSION = 'master-66874d4'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -14,6 +14,7 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer

+  def self?.init_backend: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-…
+date: 2023-05-26 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,7 +27,7 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h