llama_cpp 0.2.0 → 0.2.1

@@ -165,6 +165,11 @@ struct llama_kv_cache {
  if (ctx) {
  ggml_free(ctx);
  }
+
+ #ifdef GGML_USE_CUBLAS
+ ggml_cuda_free_data(k);
+ ggml_cuda_free_data(v);
+ #endif // GGML_USE_CUBLAS
  }
  };

@@ -210,6 +215,7 @@ struct llama_model {
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cuda_free_data(tensors_by_name[i].second);
  }
+ ggml_cuda_free_scratch();
  #elif defined(GGML_USE_CLBLAST)
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cl_free_data(tensors_by_name[i].second);
@@ -707,6 +713,9 @@ struct llama_model_loader {

  struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
  struct ggml_tensor * tensor;
+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, true);
+ }
  if (lt.ne.size() == 2) {
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
  } else {
@@ -716,6 +725,9 @@ struct llama_model_loader {
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, use_mmap);
+ }
  tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -731,6 +743,7 @@ struct llama_model_loader {
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
  size_t prefetch_size = 0;
+ size_t lock_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
  if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +753,6 @@ struct llama_model_loader {

  if (use_mmap) {
  mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
- if (!lmlock) {
- // Don't call the callback since the actual loading will be lazy
- // and we can't measure it.
- progress_callback = NULL;
- }
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -752,20 +760,49 @@ struct llama_model_loader {

  size_t done_size = 0;
  for (llama_load_tensor & lt : tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
- continue;
- }
  if (progress_callback) {
  progress_callback((float) done_size / data_size, progress_callback_user_data);
  }
  LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
  lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+ // allocate temp buffer if not using mmap
+ if (!use_mmap && lt.data == NULL) {
+ GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+ lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+ }
+
  load_data_for(lt);
- lt.ggml_tensor->data = lt.data;
- done_size += lt.size;
- if (use_mmap && lmlock) {
- lmlock->grow_to(done_size);
+
+ switch(lt.ggml_tensor->backend) {
+ case GGML_BACKEND_CPU:
+ lt.ggml_tensor->data = lt.data;
+ if (use_mmap && lmlock) {
+ lock_size += lt.size;
+ lmlock->grow_to(lock_size);
+ }
+ break;
+ #if defined(GGML_USE_CUBLAS)
+ case GGML_BACKEND_GPU:
+ case GGML_BACKEND_GPU_SPLIT:
+ ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+ #elif defined(GGML_USE_CLBLAST)
+ case GGML_BACKEND_GPU:
+ ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+ #endif
+ default:
+ continue;
  }
+
+ done_size += lt.size;
  }
  }

@@ -836,7 +873,8 @@ static bool kv_cache_init(
  const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
  ggml_type wtype,
- int n_ctx) {
+ int n_ctx,
+ int n_gpu_layers) {
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;

@@ -862,6 +900,15 @@ static bool kv_cache_init(
  ggml_set_name(cache.k, "cache_k");
  ggml_set_name(cache.v, "cache_v");

+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer + 1) {
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
+ }
+ #endif // GGML_USE_CUBLAS
+
  return true;
  }

@@ -872,6 +919,7 @@ struct llama_context_params llama_context_default_params() {
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ {0},
+ /*.low_vram =*/ false,
  /*.seed =*/ -1,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
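
The hunk above gives the new low_vram option a default of false. A minimal usage sketch (not part of the diff) of turning it on through the vendored C API; the model path and layer count below are placeholders:

#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 32;   // offload the repeating layers as before
    params.low_vram     = true; // new in 0.2.1: skip the VRAM scratch buffer
    struct llama_context * ctx = llama_init_from_file("model.bin", params); // placeholder path
    if (ctx == NULL) {
        return 1;
    }
    llama_free(ctx);
    return 0;
}

With low_vram set, the loader disables the VRAM scratch buffer (see the GGML_USE_CUBLAS hunk further down) and reports the k/v cache as not offloadable, trading some speed for VRAM headroom.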
@@ -980,6 +1028,7 @@ static void llama_model_load_internal(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1005,6 +1054,12 @@ static void llama_model_load_internal(
  case 40: model.type = e_model::MODEL_13B; break;
  case 60: model.type = e_model::MODEL_30B; break;
  case 80: model.type = e_model::MODEL_65B; break;
+ default:
+ {
+ if (hparams.n_layer < 32) {
+ model.type = e_model::MODEL_7B;
+ }
+ } break;
  }

  hparams.n_ctx = n_ctx;
@@ -1100,18 +1155,34 @@ static void llama_model_load_internal(
  ml->ggml_ctx = ctx;

  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);

  // "output" tensor
  {
+ ggml_backend backend_norm;
  ggml_backend backend_output;
  if (n_gpu_layers > int(n_layer)) { // NOLINT
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
  } else {
+ backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
  }

+ model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
  model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
  }

  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1141,7 +1212,7 @@ static void llama_model_load_internal(
  if (backend == GGML_BACKEND_GPU) {
  vram_weights +=
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
  }
  }
@@ -1169,23 +1240,49 @@ static void llama_model_load_internal(
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

  (void) vram_scratch;
+ (void) n_batch;
  #ifdef GGML_USE_CUBLAS
- vram_scratch = n_batch * MB;
- ggml_cuda_set_scratch_size(vram_scratch);
- if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
- __func__, vram_scratch / MB);
+ if (low_vram) {
+ fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+ ggml_cuda_set_scratch_size(0); // disable scratch
+ } else {
+ vram_scratch = n_batch * MB;
+ ggml_cuda_set_scratch_size(vram_scratch);
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch / MB);
+ }
  }
  #endif // GGML_USE_CUBLAS
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+ fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
+ fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
  }
+ size_t vram_kv_cache = 0;
+ if (n_gpu_layers > (int) hparams.n_layer + 1) {
+ if (low_vram) {
+ fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+ } else {
+ fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ }
+ }
+ if (n_gpu_layers > (int) hparams.n_layer + 2) {
+ if (low_vram) {
+ fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+ } else {
+ fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ }
+ }
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
- __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+ __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
  #endif
@@ -1196,58 +1293,15 @@ static void llama_model_load_internal(
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
  }

- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
+ (void) tensor_split;
  #if defined(GGML_USE_CUBLAS)
  {
  ggml_cuda_set_tensor_split(tensor_split);
-
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- ggml_backend backend = lt.ggml_tensor->backend;
- if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
  }
- #elif defined(GGML_USE_CLBLAST)
- {
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
- }
- #else
- (void) n_batch;
- (void) tensor_split;
  #endif

+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
  }
@@ -1267,6 +1321,7 @@ static bool llama_model_load(
  int n_gpu_layers,
  int main_gpu,
  float * tensor_split,
+ bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1274,7 +1329,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1350,12 +1405,33 @@ static bool llama_eval_internal(
  const int i_gpu_start = n_layer - n_gpu_layers;
  (void) i_gpu_start;

+ // offload functions set the tensor output backend to GPU
+ // tensors are GPU-accelerated if any input or the output has been offloaded
+ //
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+ // in that case ggml_cuda_assign_buffers has no effect
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+ offload_func_t offload_func_kq = llama_nop;
+ offload_func_t offload_func_v = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
+ #endif // GGML_USE_CUBLAS
+
  for (int il = 0; il < n_layer; ++il) {
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
  if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ offload_func = ggml_cuda_assign_buffers;
  }
  #endif // GGML_USE_CUBLAS

@@ -1378,31 +1454,42 @@ static bool llama_eval_internal(
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- // offload_func(tmpq);
- ggml_set_name(tmpq, "tmpq");
-
  struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- // offload_func(tmpk);
+ offload_func_kq(tmpk);
  ggml_set_name(tmpk, "tmpk");

+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ offload_func_kq(tmpq);
+ ggml_set_name(tmpq, "tmpq");
+
  struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

  struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

  // store key and value to memory
  {
  // compute the transposed [N, n_embd] V matrix
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ offload_func_v(tmpv);
+ ggml_set_name(tmpv, "tmpv");
+
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+ offload_func_v(Vcur);
  ggml_set_name(Vcur, "Vcur");

  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ offload_func_kq(k);
  ggml_set_name(k, "k");
+
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
  ( n_ctx)*ggml_element_size(kv_self.v),
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ offload_func_v(v);
  ggml_set_name(v, "v");

  // important: storing RoPE-ed version of K in the KV cache!
@@ -1414,6 +1501,7 @@ static bool llama_eval_internal(
  ggml_permute(ctx0,
  Qcur,
  0, 2, 1, 3);
+ offload_func_kq(Q);
  ggml_set_name(Q, "Q");

  struct ggml_tensor * K =
@@ -1422,10 +1510,12 @@ static bool llama_eval_internal(
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
  n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);
+ offload_func_kq(K);
  ggml_set_name(K, "K");

  // K * Q
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ offload_func_kq(KQ);
  ggml_set_name(KQ, "KQ");

  // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1434,14 +1524,17 @@ static bool llama_eval_internal(

  // KQ_scaled shape [n_past + N, N, n_head, 1]
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ offload_func_kq(KQ_scaled);
  ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ offload_func_kq(KQ_masked);
  ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ offload_func_v(KQ_soft_max);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

  // split cached V into n_head heads
@@ -1451,10 +1544,12 @@ static bool llama_eval_internal(
  n_ctx*ggml_element_size(kv_self.v),
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ offload_func_v(V);
  ggml_set_name(V, "V");

  #if 1
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ offload_func_v(KQV);
  ggml_set_name(KQV, "KQV");
  #else
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1466,12 +1561,14 @@ static bool llama_eval_internal(

  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ offload_func_v(KQV_merged);
  ggml_set_name(KQV_merged, "KQV_merged");

  // cur = KQV_merged.contiguous().view(n_embd, N)
  cur = ggml_cpy(ctx0,
  KQV_merged,
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ offload_func_v(cur);
  ggml_set_name(cur, "KQV_merged_contiguous");

  // projection (no bias)
@@ -1483,7 +1580,6 @@ static bool llama_eval_internal(
  }

  lctx.use_buf(ctx0, 1);
- //ggml_cuda_set_scratch(1);

  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
  offload_func(inpFF);
@@ -1541,32 +1637,24 @@ static bool llama_eval_internal(
  }

  lctx.use_buf(ctx0, 0);
- //ggml_cuda_set_scratch(0);

  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

- offload_func_t offload_func = llama_nop;
-
- #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
- }
- #endif // GGML_USE_CUBLAS

  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
- offload_func(cur);
+ offload_func_nr(cur);
  ggml_set_name(cur, "rms_norm_inpL");

  cur = ggml_rms_norm(ctx0, cur);
- offload_func(cur);
+ offload_func_nr(cur);
  ggml_set_name(cur, "rms_norm_after");

  // cur = cur*norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.norm);
- offload_func(cur);
+ offload_func_nr(cur);
  ggml_set_name(cur, "result_norm");

  embeddings = cur;
@@ -2174,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
  return -log2f(candidate.p) > *mu;
  }));

+ if (candidates->size == 0) {
+ candidates->size = 1;
+ }
+
  // Normalize the probabilities of the remaining words
  llama_sample_softmax(ctx, candidates);

@@ -2311,7 +2403,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

+ #ifdef GGML_USE_K_QUANTS
  // K-quants
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2322,6 +2417,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ #endif
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  }

@@ -2333,6 +2429,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

+ #ifdef GGML_USE_K_QUANTS
  int n_attention_wv = 0;
  int n_feed_forward_w2 = 0;
  for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2346,6 +2443,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  int i_attention_wv = 0;
  int i_feed_forward_w2 = 0;
+ #endif

  size_t total_size_org = 0;
  size_t total_size_new = 0;
@@ -2371,12 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);
-
- // uncomment this to keep the output layer in FP16
- if (!params->quantize_output_tensor && tensor.name == "output.weight") {
- quantize = false;
- }
- quantize = quantize && quantized_type != tensor.type;
+ quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+ quantize &= quantized_type != tensor.type;

  enum ggml_type new_type;
  void * new_data;
@@ -2390,31 +2484,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
  } else {
  new_type = quantized_type;
- // TODO: temporary disabled until Metal / OpenCL support is available
- // ref: https://github.com/ggerganov/llama.cpp/issues/1711
- //if (tensor.name == "output.weight") {
- // new_type = GGML_TYPE_Q6_K;
- //}
- if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ #ifdef GGML_USE_K_QUANTS
+ if (tensor.name == "output.weight") {
+ new_type = GGML_TYPE_Q6_K;
+ } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
  (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
- }
- if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
  (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
- }
- if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ #endif

  float * f32_data;
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
@@ -2554,8 +2645,8 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+ params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
  params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
  llama_free(ctx);
@@ -2564,7 +2655,7 @@ struct llama_context * llama_init_from_file(

  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
@@ -3301,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

+ int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity) {
+ int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+ for (int i = 0; i<n; ++i) {
+ strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+ scores[i] = ctx->vocab.id_to_token[i].score;
+ }
+ return n;
+ }
+
  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }
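
The new llama_get_vocab entry point above writes at most capacity token strings and scores into caller-provided arrays and returns how many entries it filled. A usage sketch (not part of the diff); llama_n_vocab is assumed from the existing C API and the output format is arbitrary:

#include <stdio.h>
#include <stdlib.h>
#include "llama.h"

static void dump_vocab(const struct llama_context * ctx) {
    const int n_vocab = llama_n_vocab(ctx);   // assumed existing accessor
    const char ** strings = (const char **) malloc(n_vocab * sizeof(*strings));
    float * scores = (float *) malloc(n_vocab * sizeof(*scores));

    // fills up to n_vocab entries and returns the number actually written
    const int n = llama_get_vocab(ctx, strings, scores, n_vocab);
    for (int i = 0; i < n; ++i) {
        printf("%6d  %-24s  %8.3f\n", i, strings[i], scores[i]);
    }

    free(scores);
    free(strings);
}

The returned pointers reference the context's own vocabulary storage, so they stay valid only as long as the context does.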