llama_cpp 0.2.0 → 0.2.1
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +697 -130
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +548 -497
- data/ext/llama_cpp/src/ggml-metal.metal +425 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -32
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +1904 -303
- data/ext/llama_cpp/src/ggml.h +126 -2
- data/ext/llama_cpp/src/llama.cpp +212 -108
- data/ext/llama_cpp/src/llama.h +12 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -165,6 +165,11 @@ struct llama_kv_cache {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
     }
 };
 
@@ -210,6 +215,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
@@ -707,6 +713,9 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +725,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -731,6 +743,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +753,6 @@
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -752,20 +760,49 @@
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-
-
-
-
+
+            switch(lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }
 
@@ -836,7 +873,8 @@ static bool kv_cache_init(
         const struct llama_hparams & hparams,
         struct llama_kv_cache & cache,
         ggml_type wtype,
-        int n_ctx
+        int n_ctx,
+        int n_gpu_layers) {
     const int n_embd = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
@@ -862,6 +900,15 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+    }
+#endif // GGML_USE_CUBLAS
+
     return true;
 }
 
@@ -872,6 +919,7 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ {0},
+        /*.low_vram =*/ false,
        /*.seed =*/ -1,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
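
Note (not part of the diff): the hunk above adds a low_vram default to llama_context_params, and the hunks that follow thread it through model loading so the VRAM scratch buffer and KV-cache offloading can be skipped. A minimal caller-side sketch of the C API, assuming the field and function names from this version's llama.h; the model path and layer count are placeholders:

    #include "llama.h"

    int main(void) {
        struct llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 20;   // offload some repeating layers (assumed field name)
        params.low_vram     = true; // new in 0.2.1: do not allocate a VRAM scratch buffer
        struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            return 1;
        }
        llama_free(ctx);
        return 0;
    }
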
@@ -980,6 +1028,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1005,6 +1054,12 @@ static void llama_model_load_internal(
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
             case 80: model.type = e_model::MODEL_65B; break;
+            default:
+                {
+                    if (hparams.n_layer < 32) {
+                        model.type = e_model::MODEL_7B;
+                    }
+                } break;
         }
 
         hparams.n_ctx = n_ctx;
@@ -1100,18 +1155,34 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
 
         // "output" tensor
         {
+            ggml_backend backend_norm;
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
+                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
+                backend_norm = GGML_BACKEND_CPU;
                 backend_output = GGML_BACKEND_CPU;
             }
 
+            model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+            if (backend_norm == GGML_BACKEND_GPU) {
+                vram_weights += ggml_nbytes(model.norm);
+            }
+            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                vram_weights += ggml_nbytes(model.output);
+            }
         }
 
         const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1141,7 +1212,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1169,23 +1240,49 @@ static void llama_model_load_internal(
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
     (void) vram_scratch;
+    (void) n_batch;
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
+    if (low_vram) {
+        fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+        ggml_cuda_set_scratch_size(0); // disable scratch
+    } else {
+        vram_scratch = n_batch * MB;
+        ggml_cuda_set_scratch_size(vram_scratch);
+        if (n_gpu_layers > 0) {
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch / MB);
+        }
     }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-    fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+    fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
-        fprintf(stderr, "%s: offloading
+        fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
+    size_t vram_kv_cache = 0;
+    if (n_gpu_layers > (int) hparams.n_layer + 1) {
+        if (low_vram) {
+            fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+        } else {
+            fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+        }
+    }
+    if (n_gpu_layers > (int) hparams.n_layer + 2) {
+        if (low_vram) {
+            fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+        } else {
+            fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+        }
+    }
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+    fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-            __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+            __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
 #endif
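
Note (not part of the diff): in the reporting above, the non-repeating tensors and the two halves of the KV cache count as up to three extra offloadable "layers" on top of hparams.n_layer, which is why the maximum reported is n_layer + 3 (or n_layer + 1 with low_vram). A small standalone illustration of that arithmetic, assuming a hypothetical 32-layer model:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const int  n_layer      = 32;    // repeating layers (assumption for this example)
        const int  n_gpu_layers = 35;    // ask for everything: layers, non-repeating tensors, V and K cache
        const bool low_vram     = false;

        // same thresholds as the hunk above: > n_layer offloads the non-repeating tensors,
        // > n_layer + 1 the V cache, > n_layer + 2 the K cache (both skipped with low_vram)
        const int max_offloadable_layers = low_vram ? n_layer + 1 : n_layer + 3;
        std::printf("offloaded %d/%d layers to GPU\n",
                    std::min(n_gpu_layers, max_offloadable_layers), n_layer + 3);
        return 0;
    }
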
@@ -1196,58 +1293,15 @@ static void llama_model_load_internal(
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }
 
-
-
+    (void) tensor_split;
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
-
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            ggml_backend backend = lt.ggml_tensor->backend;
-            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
     }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#else
-    (void) n_batch;
-    (void) tensor_split;
 #endif
 
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
@@ -1267,6 +1321,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1274,7 +1329,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1350,12 +1405,33 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
 
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
+#endif // GGML_USE_CUBLAS
+
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers;
+            offload_func = ggml_cuda_assign_buffers;
         }
 #endif // GGML_USE_CUBLAS
 
@@ -1378,31 +1454,42 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            // offload_func(tmpq);
-            ggml_set_name(tmpq, "tmpq");
-
             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-
+            offload_func_kq(tmpk);
             ggml_set_name(tmpk, "tmpk");
 
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
-
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
+
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         ( n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
                 ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
@@ -1414,6 +1501,7 @@ static bool llama_eval_internal(
                         ggml_permute(ctx0,
                             Qcur,
                             0, 2, 1, 3);
+                offload_func_kq(Q);
                 ggml_set_name(Q, "Q");
 
                 struct ggml_tensor * K =
@@ -1422,10 +1510,12 @@ static bool llama_eval_internal(
                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
+                offload_func_kq(K);
                 ggml_set_name(K, "K");
 
                 // K * Q
                 struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                offload_func_kq(KQ);
                 ggml_set_name(KQ, "KQ");
 
                 // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1434,14 +1524,17 @@ static bool llama_eval_internal(
 
                 // KQ_scaled shape [n_past + N, N, n_head, 1]
                 struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+                offload_func_kq(KQ_scaled);
                 ggml_set_name(KQ_scaled, "KQ_scaled");
 
                 // KQ_masked = mask_past(KQ_scaled)
                 struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                offload_func_kq(KQ_masked);
                 ggml_set_name(KQ_masked, "KQ_masked");
 
                 // KQ = soft_max(KQ_masked)
                 struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+                offload_func_v(KQ_soft_max);
                 ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
                 // split cached V into n_head heads
@@ -1451,10 +1544,12 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+                offload_func_v(V);
                 ggml_set_name(V, "V");
 
 #if 1
                 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+                offload_func_v(KQV);
                 ggml_set_name(KQV, "KQV");
 #else
                 // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1466,12 +1561,14 @@ static bool llama_eval_internal(
 
                 // KQV_merged = KQV.permute(0, 2, 1, 3)
                 struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                offload_func_v(KQV_merged);
                 ggml_set_name(KQV_merged, "KQV_merged");
 
                 // cur = KQV_merged.contiguous().view(n_embd, N)
                 cur = ggml_cpy(ctx0,
                         KQV_merged,
                         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+                offload_func_v(cur);
                 ggml_set_name(cur, "KQV_merged_contiguous");
 
                 // projection (no bias)
@@ -1483,7 +1580,6 @@ static bool llama_eval_internal(
         }
 
         lctx.use_buf(ctx0, 1);
-        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1541,32 +1637,24 @@
     }
 
     lctx.use_buf(ctx0, 0);
-    //ggml_cuda_set_scratch(0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-    offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
-    }
-#endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
-
+        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
-
+        offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
-
+        offload_func_nr(cur);
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
@@ -2174,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         return -log2f(candidate.p) > *mu;
     }));
 
+    if (candidates->size == 0) {
+        candidates->size = 1;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
@@ -2311,7 +2403,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
+#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2322,6 +2417,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+#endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
@@ -2333,6 +2429,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
+#ifdef GGML_USE_K_QUANTS
    int n_attention_wv = 0;
     int n_feed_forward_w2 = 0;
     for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2346,6 +2443,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
+#endif
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2371,12 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
-
-
-        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
-            quantize = false;
-        }
-        quantize = quantize && quantized_type != tensor.type;
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+        quantize &= quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
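
Note (not part of the diff): the two added quantize &= lines above are the De Morgan form of the removed if-block plus the removed type check, so the behaviour is unchanged. A quick standalone check of that equivalence with hypothetical tensor names:

    #include <cassert>
    #include <string>

    int main() {
        for (const bool quantize_output_tensor : {false, true}) {
            for (const std::string name : {"output.weight", "layers.0.attention.wq.weight"}) {
                bool old_style = true;
                if (!quantize_output_tensor && name == "output.weight") {
                    old_style = false;
                }
                bool new_style = true;
                new_style &= quantize_output_tensor || name != "output.weight";
                assert(old_style == new_style);
            }
        }
        return 0;
    }
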
@@ -2390,31 +2484,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-
-
-
-
-            //}
-            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+#ifdef GGML_USE_K_QUANTS
+            if (tensor.name == "output.weight") {
+                new_type = GGML_TYPE_Q6_K;
+            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
-            }
-            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            }
-            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+#endif
 
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
@@ -2554,8 +2645,8 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                          params.
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
@@ -2564,7 +2655,7 @@ struct llama_context * llama_init_from_file(
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -3301,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i] = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
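
Note (not part of the diff): a usage sketch for the new llama_get_vocab() function added above. It copies up to capacity token strings and scores into caller-provided arrays and returns how many entries were written; llama_n_vocab() is assumed to be available for sizing, and context setup is elided:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    void dump_vocab(const struct llama_context * ctx) {
        const int n_vocab = llama_n_vocab(ctx);   // assumed helper from llama.h
        std::vector<const char *> strings(n_vocab);
        std::vector<float>        scores(n_vocab);
        const int n = llama_get_vocab(ctx, strings.data(), scores.data(), n_vocab);
        for (int i = 0; i < n; ++i) {
            std::printf("%6d  %s  score=%.4f\n", i, strings[i], scores[i]);
        }
    }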