llama_cpp 0.2.0 → 0.2.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/examples/README.md +92 -0
- data/examples/chat.rb +195 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1218 -411
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +703 -514
- data/ext/llama_cpp/src/ggml-metal.metal +574 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +496 -36
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +2715 -476
- data/ext/llama_cpp/src/ggml.h +266 -11
- data/ext/llama_cpp/src/llama.cpp +266 -135
- data/ext/llama_cpp/src/llama.h +19 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +5 -2
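The bulk of this release is the bundled llama.cpp update: repeating layers, the non-repeating output tensors, and the KV cache can now be offloaded to the GPU, and a new `low_vram` flag in `llama_context_params` disables the VRAM scratch buffer and the KV-cache offload. A minimal sketch of how the new flag might be set through the bundled C API is shown below; the `n_gpu_layers` field name and the model path are assumptions, while `low_vram`, `llama_context_default_params`, `llama_init_from_file`, and `llama_free` appear in the diff that follows.

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx        = 512;
    params.n_gpu_layers = 35;   // assumed field name; number of layers to keep in VRAM
    params.low_vram     = true; // new in this release: skip the VRAM scratch buffer and KV-cache offload
    struct llama_context * ctx = llama_init_from_file("model.bin", params); // "model.bin" is a placeholder path
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_free(ctx);
    return 0;
}
```

With `low_vram` set, the loader below skips the batch-sized VRAM scratch buffer and refuses to move the k/v cache into VRAM, trading some speed for a smaller footprint on cards with little memory.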
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -19,6 +19,11 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_K_QUANTS
+#ifndef QK_K
+#define QK_K 256
+#endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -40,6 +45,10 @@
 #include <sstream>
 #include <numeric>
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -165,6 +174,11 @@ struct llama_kv_cache {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
     }
 };
 
@@ -210,6 +224,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
@@ -707,6 +722,9 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +734,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -731,6 +752,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +762,6 @@
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -752,20 +769,49 @@
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-
-
-
-
+
+            switch(lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }
 
@@ -836,7 +882,8 @@ static bool kv_cache_init(
         const struct llama_hparams & hparams,
         struct llama_kv_cache & cache,
         ggml_type wtype,
-        int n_ctx
+        int n_ctx,
+        int n_gpu_layers) {
     const int n_embd = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
@@ -844,6 +891,7 @@ static bool kv_cache_init(
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.n = 0;
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -862,25 +910,36 @@
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+    (void) n_gpu_layers;
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+    }
+#endif // GGML_USE_CUBLAS
+
     return true;
 }
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed =*/ -1,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ {0},
-        /*.
+        /*.progress_callback =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.low_vram =*/ false,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.progress_callback =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };
 
     return result;
@@ -980,6 +1039,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1005,6 +1065,12 @@ static void llama_model_load_internal(
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
             case 80: model.type = e_model::MODEL_65B; break;
+            default:
+                {
+                    if (hparams.n_layer < 32) {
+                        model.type = e_model::MODEL_7B;
+                    }
+                } break;
         }
 
         hparams.n_ctx = n_ctx;
@@ -1100,18 +1166,34 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
 
         // "output" tensor
         {
+            ggml_backend backend_norm;
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
+                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
+                backend_norm = GGML_BACKEND_CPU;
                 backend_output = GGML_BACKEND_CPU;
            }
 
+            model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+            if (backend_norm == GGML_BACKEND_GPU) {
+                vram_weights += ggml_nbytes(model.norm);
+            }
+            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                vram_weights += ggml_nbytes(model.output);
+            }
         }
 
         const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1141,7 +1223,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1169,23 +1251,49 @@ static void llama_model_load_internal(
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
         (void) vram_scratch;
+        (void) n_batch;
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
+        if (low_vram) {
+            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            ggml_cuda_set_scratch_size(0); // disable scratch
+        } else {
+            vram_scratch = n_batch * MB;
+            ggml_cuda_set_scratch_size(vram_scratch);
+            if (n_gpu_layers > 0) {
+                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
+                        __func__, vram_scratch / MB);
+            }
         }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: offloading
+            fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
+        }
+        size_t vram_kv_cache = 0;
+        if (n_gpu_layers > (int) hparams.n_layer + 1) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
+        }
+        if (n_gpu_layers > (int) hparams.n_layer + 2) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
         }
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
         fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
 #endif
@@ -1196,58 +1304,15 @@ static void llama_model_load_internal(
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }
 
-
-
+    (void) tensor_split;
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
-
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            ggml_backend backend = lt.ggml_tensor->backend;
-            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
     }
-#else
-    (void) n_batch;
-    (void) tensor_split;
 #endif
 
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
@@ -1267,6 +1332,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1274,7 +1340,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1350,12 +1416,33 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
 
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
+#endif // GGML_USE_CUBLAS
+
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers;
+            offload_func = ggml_cuda_assign_buffers;
         }
 #endif // GGML_USE_CUBLAS
 
@@ -1378,31 +1465,42 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            // offload_func(tmpq);
-            ggml_set_name(tmpq, "tmpq");
-
             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-
+            offload_func_kq(tmpk);
             ggml_set_name(tmpk, "tmpk");
 
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
-
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
+
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         ( n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
                 ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
@@ -1414,6 +1512,7 @@ static bool llama_eval_internal(
                         ggml_permute(ctx0,
                             Qcur,
                             0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
@@ -1422,10 +1521,12 @@
                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
+            offload_func_kq(K);
             ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1434,14 +1535,17 @@
 
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
@@ -1451,10 +1555,12 @@
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            offload_func_v(V);
             ggml_set_name(V, "V");
 
 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1466,12 +1572,14 @@
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
 
             // projection (no bias)
@@ -1483,7 +1591,6 @@
         }
 
         lctx.use_buf(ctx0, 1);
-        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1513,7 +1620,7 @@
                     model.layers[il].w1,
                     cur);
             offload_func(cur);
-            ggml_set_name(cur, "
+            ggml_set_name(cur, "result_w1");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
@@ -1541,32 +1648,20 @@
     }
 
     lctx.use_buf(ctx0, 0);
-    //ggml_cuda_set_scratch(0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-    offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
-    }
-#endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
-
-        ggml_set_name(cur, "
-
-        cur = ggml_rms_norm(ctx0, cur);
-        offload_func(cur);
-        ggml_set_name(cur, "rms_norm_after");
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
-
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
        ggml_set_name(cur, "result_norm");
 
        embeddings = cur;
@@ -2174,6 +2269,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
        return -log2f(candidate.p) > *mu;
     }));
 
+    if (candidates->size == 0) {
+        candidates->size = 1;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
@@ -2311,7 +2410,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
+#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2322,6 +2424,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+#endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
@@ -2333,6 +2436,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                                                     /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
+#ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
     int n_feed_forward_w2 = 0;
     for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2346,6 +2450,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
+#endif
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2371,12 +2476,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
-
-
-        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
-            quantize = false;
-        }
-        quantize = quantize && quantized_type != tensor.type;
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+        quantize &= quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2390,31 +2491,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-
-
-
-
-
-
+#ifdef GGML_USE_K_QUANTS
+            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
+            if (tensor.name == "output.weight") {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K == 0 && ny % QK_K == 0) {
+                    new_type = GGML_TYPE_Q6_K;
+                }
+            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
-            }
-            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            }
-            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+#endif
 
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
@@ -2554,8 +2667,8 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                          params.
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                           params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
@@ -2564,7 +2677,7 @@ struct llama_context * llama_init_from_file(
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
@@ -2599,16 +2712,21 @@ struct llama_context * llama_init_from_file(
         // this allocates all Metal resources and memory buffers
         ctx->ctx_metal = ggml_metal_init();
 
-        void *data_ptr
+        void * data_ptr = NULL;
         size_t data_size = 0;
+
         if (params.use_mmap) {
-            data_ptr
-            data_size= ctx->model.mapping->size;
+            data_ptr = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
         } else {
-            data_ptr
-            data_size= ggml_get_mem_size(ctx->model.ctx);
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size (ctx->model.ctx);
         }
 
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
         fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2616,12 +2734,13 @@ struct llama_context * llama_init_from_file(
         return NULL; \
     }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -3007,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     if (kv_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-
-
-        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
 
@@ -3115,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-
-
-        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
 
@@ -3301,6 +3416,19 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i] = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
@@ -3339,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token
-
-    fprintf(stderr, "%s:
+    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
     fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 
```
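Among the smaller additions above, `llama_get_vocab` copies up to `capacity` token strings and scores out of the context's vocabulary. A hedged usage sketch, assuming the pre-existing `llama_n_vocab` accessor and an already initialized context:

```c
#include <stdio.h>
#include <stdlib.h>
#include "llama.h"

// Dump the first few vocabulary entries of an already-initialized context.
static void dump_vocab(struct llama_context * ctx) {
    const int n_vocab = llama_n_vocab(ctx); // existing API, assumed available in this version

    const char ** strings = malloc(n_vocab * sizeof(*strings));
    float * scores        = malloc(n_vocab * sizeof(*scores));

    // new in this release: fills the arrays and returns how many entries were written
    const int n = llama_get_vocab(ctx, strings, scores, n_vocab);
    for (int i = 0; i < 10 && i < n; ++i) {
        printf("%5d: %-16s score=%.3f\n", i, strings[i], scores[i]);
    }

    free(strings);
    free(scores);
}
```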