llama_cpp 0.2.0 → 0.2.1

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -165,6 +165,11 @@ struct llama_kv_cache {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
     }
 };
 
@@ -210,6 +215,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
@@ -707,6 +713,9 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +725,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -731,6 +743,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +753,6 @@ struct llama_model_loader {
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -752,20 +760,49 @@ struct llama_model_loader {
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
+
+            switch(lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }
 
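For illustration: the rewritten load_all_data no longer skips non-CPU tensors up front. Every tensor is read, then a switch on its backend decides whether the data stays in host memory or is handed to ggml_cuda_transform_tensor / ggml_cl_transform_tensor, with the temporary host buffer freed when mmap is not in use. Below is a minimal standalone sketch of that dispatch shape; tensor_stub and backend_t are hypothetical stand-ins, not llama.cpp types.

```cpp
// Minimal sketch of the backend dispatch in load_all_data; tensor_stub and
// backend_t are hypothetical stand-ins, and the GPU upload itself is omitted.
#include <cstdio>
#include <cstdlib>
#include <vector>

enum backend_t { BACKEND_CPU, BACKEND_GPU };

struct tensor_stub {
    backend_t backend;
    size_t    nbytes;
    void *    data;
};

int main() {
    const bool use_mmap = false;  // when false, each tensor gets a temporary host buffer
    std::vector<tensor_stub> tensors = { {BACKEND_CPU, 16, nullptr}, {BACKEND_GPU, 32, nullptr} };

    size_t done_size = 0;
    for (auto & t : tensors) {
        void * buf = std::malloc(t.nbytes);  // stands in for the temp buffer / mmap'd region

        switch (t.backend) {
        case BACKEND_CPU:
            t.data = buf;  // CPU tensors keep pointing at host memory
            break;
        case BACKEND_GPU:
            // in llama.cpp, ggml_cuda_transform_tensor / ggml_cl_transform_tensor copies
            // the data to the device here, after which the host copy is redundant
            if (!use_mmap) {
                std::free(buf);
            }
            break;
        }
        done_size += t.nbytes;
    }
    std::printf("processed %zu bytes\n", done_size);

    for (auto & t : tensors) {  // toy-example cleanup of the remaining host buffers
        std::free(t.data);
    }
    return 0;
}
```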
@@ -836,7 +873,8 @@ static bool kv_cache_init(
         const struct llama_hparams & hparams,
         struct llama_kv_cache & cache,
         ggml_type wtype,
-        int n_ctx) {
+        int n_ctx,
+        int n_gpu_layers) {
     const int n_embd = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
@@ -862,6 +900,15 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+    }
+#endif // GGML_USE_CUBLAS
+
     return true;
 }
 
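kv_cache_init now receives n_gpu_layers and follows the same convention as the rest of 0.2.1: one layer beyond n_layer offloads the non-repeating tensors, a second extra layer offloads the V cache, a third the K cache. A small standalone sketch of those thresholds; offload_plan and plan_offload are illustrative names, not llama.cpp symbols.

```cpp
// Sketch of the n_gpu_layers thresholds used for extra offloading in 0.2.1;
// offload_plan and plan_offload are illustrative names only.
#include <cstdio>

struct offload_plan {
    bool non_repeating;  // output/norm tensors   (n_gpu_layers > n_layer)
    bool v_cache;        // KV cache V tensor     (n_gpu_layers > n_layer + 1)
    bool k_cache;        // KV cache K tensor     (n_gpu_layers > n_layer + 2)
};

static offload_plan plan_offload(int n_gpu_layers, int n_layer, bool low_vram) {
    offload_plan p;
    p.non_repeating = n_gpu_layers > n_layer;
    // with the low VRAM option the KV cache always stays in host memory
    p.v_cache = !low_vram && n_gpu_layers > n_layer + 1;
    p.k_cache = !low_vram && n_gpu_layers > n_layer + 2;
    return p;
}

int main() {
    const int n_layer = 32;  // e.g. a 7B model
    const int opts[] = {32, 33, 34, 35};
    for (int n_gpu_layers : opts) {
        offload_plan p = plan_offload(n_gpu_layers, n_layer, /*low_vram=*/false);
        std::printf("n_gpu_layers=%d  non_repeating=%d  v_cache=%d  k_cache=%d\n",
                    n_gpu_layers, p.non_repeating, p.v_cache, p.k_cache);
    }
    return 0;
}
```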
@@ -872,6 +919,7 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ {0},
+        /*.low_vram =*/ false,
         /*.seed =*/ -1,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
@@ -980,6 +1028,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1005,6 +1054,12 @@ static void llama_model_load_internal(
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
             case 80: model.type = e_model::MODEL_65B; break;
+            default:
+                {
+                    if (hparams.n_layer < 32) {
+                        model.type = e_model::MODEL_7B;
+                    }
+                } break;
         }
 
         hparams.n_ctx = n_ctx;
@@ -1100,18 +1155,34 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
 
         // "output" tensor
         {
+            ggml_backend backend_norm;
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
+                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
+                backend_norm = GGML_BACKEND_CPU;
                 backend_output = GGML_BACKEND_CPU;
             }
 
+            model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+            if (backend_norm == GGML_BACKEND_GPU) {
+                vram_weights += ggml_nbytes(model.norm);
+            }
+            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                vram_weights += ggml_nbytes(model.output);
+            }
         }
 
         const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1141,7 +1212,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                    ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1169,23 +1240,49 @@ static void llama_model_load_internal(
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
         (void) vram_scratch;
+        (void) n_batch;
 #ifdef GGML_USE_CUBLAS
-        vram_scratch = n_batch * MB;
-        ggml_cuda_set_scratch_size(vram_scratch);
-        if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
-                    __func__, vram_scratch / MB);
+        if (low_vram) {
+            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            ggml_cuda_set_scratch_size(0); // disable scratch
+        } else {
+            vram_scratch = n_batch * MB;
+            ggml_cuda_set_scratch_size(vram_scratch);
+            if (n_gpu_layers > 0) {
+                fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                        __func__, vram_scratch / MB);
+            }
         }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
+            fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
+        size_t vram_kv_cache = 0;
+        if (n_gpu_layers > (int) hparams.n_layer + 1) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
+        }
+        if (n_gpu_layers > (int) hparams.n_layer + 2) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
+        }
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
         fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
 #endif
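The "total VRAM used" log line now counts the offloaded KV cache alongside weights and scratch, and the low_vram option zeroes the scratch term and keeps the KV cache on the host. A rough standalone sketch of that arithmetic with made-up sizes; vram_estimate_mb is not a llama.cpp function.

```cpp
// Rough sketch of the VRAM total printed by llama_model_load_internal;
// vram_estimate_mb is not a llama.cpp function and the sizes are made up.
#include <cstddef>
#include <cstdio>

static const size_t MB = 1024 * 1024;

static size_t vram_estimate_mb(size_t vram_weights, size_t n_batch, size_t kv_self_size,
                               int n_gpu_layers, int n_layer, bool low_vram) {
    const size_t vram_scratch = low_vram ? 0 : n_batch * MB;  // scratch is disabled with low_vram
    size_t vram_kv_cache = 0;
    if (!low_vram && n_gpu_layers > n_layer + 1) vram_kv_cache += kv_self_size / 2;  // V cache
    if (!low_vram && n_gpu_layers > n_layer + 2) vram_kv_cache += kv_self_size / 2;  // K cache
    return (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB;  // round up
}

int main() {
    const size_t weights = 3900ULL * MB;  // hypothetical 7B-class weight footprint
    const size_t kv_self = 1024ULL * MB;  // hypothetical f16 KV cache size
    std::printf("low_vram=0: %zu MB\n", vram_estimate_mb(weights, 512, kv_self, 35, 32, false));
    std::printf("low_vram=1: %zu MB\n", vram_estimate_mb(weights, 512, kv_self, 35, 32, true));
    return 0;
}
```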
@@ -1196,58 +1293,15 @@ static void llama_model_load_internal(
             model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
         }
 
-        ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
+        (void) tensor_split;
 #if defined(GGML_USE_CUBLAS)
         {
             ggml_cuda_set_tensor_split(tensor_split);
-
-            size_t done_size = 0;
-            size_t data_size = 0;
-            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-                data_size += lt.size;
-                if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                    done_size += lt.size;
-                }
-            }
-            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-                ggml_backend backend = lt.ggml_tensor->backend;
-                if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                    continue;
-                }
-                if (progress_callback) {
-                    progress_callback((float) done_size / data_size, progress_callback_user_data);
-                }
-                ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-                done_size += lt.size;
-            }
         }
-#elif defined(GGML_USE_CLBLAST)
-        {
-            size_t done_size = 0;
-            size_t data_size = 0;
-            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-                data_size += lt.size;
-                if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                    done_size += lt.size;
-                }
-            }
-            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-                if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                    continue;
-                }
-                if (progress_callback) {
-                    progress_callback((float) done_size / data_size, progress_callback_user_data);
-                }
-                ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-                done_size += lt.size;
-            }
-        }
-#else
-        (void) n_batch;
-        (void) tensor_split;
 #endif
 
+        ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
         if (progress_callback) {
             progress_callback(1.0f, progress_callback_user_data);
         }
@@ -1267,6 +1321,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1274,7 +1329,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1350,12 +1405,33 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
 
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
+#endif // GGML_USE_CUBLAS
+
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+            offload_func = ggml_cuda_assign_buffers;
         }
 #endif // GGML_USE_CUBLAS
 
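The three offload hooks above use a function-pointer default of llama_nop that is swapped for ggml_cuda_assign_buffers only when enough layers are offloaded, so calls on non-offloaded paths are harmless no-ops. A standalone sketch of the same pattern; the types and functions here are stand-ins, not the ggml API.

```cpp
// Sketch of the llama_nop / ggml_cuda_assign_buffers function-pointer pattern;
// tensor_stub, nop and assign_gpu are stand-ins, not the ggml API.
#include <cstdio>

struct tensor_stub { const char * name; bool on_gpu; };

typedef void (*offload_func_t)(tensor_stub *);

static void nop(tensor_stub *) {}                              // stands in for llama_nop
static void assign_gpu(tensor_stub * t) { t->on_gpu = true; }  // stands in for ggml_cuda_assign_buffers

int main() {
    const int n_layer = 32, n_gpu_layers = 34;  // n_layer + 2

    offload_func_t offload_func_nr = nop;  // non-repeating tensors (final norm etc.)
    offload_func_t offload_func_v  = nop;  // tensors on the V path of attention
    offload_func_t offload_func_kq = nop;  // tensors on the K*Q path

    if (n_gpu_layers > n_layer)     offload_func_nr = assign_gpu;
    if (n_gpu_layers > n_layer + 1) offload_func_v  = assign_gpu;
    if (n_gpu_layers > n_layer + 2) offload_func_kq = assign_gpu;

    tensor_stub norm{"result_norm", false}, kq{"KQ", false}, v{"V", false};
    offload_func_nr(&norm);
    offload_func_kq(&kq);
    offload_func_v(&v);

    std::printf("%s:%d %s:%d %s:%d\n", norm.name, norm.on_gpu, kq.name, kq.on_gpu, v.name, v.on_gpu);
    return 0;
}
```

With n_gpu_layers = n_layer + 2 as above, the V path is offloaded while the K*Q path is not, matching the thresholds in the diff.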
@@ -1378,31 +1454,42 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            // offload_func(tmpq);
-            ggml_set_name(tmpq, "tmpq");
-
             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            // offload_func(tmpk);
+            offload_func_kq(tmpk);
             ggml_set_name(tmpk, "tmpk");
 
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
+
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         ( n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
                 ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
@@ -1414,6 +1501,7 @@ static bool llama_eval_internal(
                         ggml_permute(ctx0,
                                 Qcur,
                                 0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
@@ -1422,10 +1510,12 @@ static bool llama_eval_internal(
                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
+            offload_func_kq(K);
             ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1434,14 +1524,17 @@ static bool llama_eval_internal(
 
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
@@ -1451,10 +1544,12 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            offload_func_v(V);
             ggml_set_name(V, "V");
 
 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1466,12 +1561,14 @@ static bool llama_eval_internal(
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
 
             // projection (no bias)
@@ -1483,7 +1580,6 @@ static bool llama_eval_internal(
         }
 
         lctx.use_buf(ctx0, 1);
-        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1541,32 +1637,24 @@ static bool llama_eval_internal(
     }
 
     lctx.use_buf(ctx0, 0);
-    //ggml_cuda_set_scratch(0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-    offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
-    }
-#endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
-        offload_func(cur);
+        offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
-        offload_func(cur);
+        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
-        offload_func(cur);
+        offload_func_nr(cur);
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
@@ -2174,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         return -log2f(candidate.p) > *mu;
     }));
 
+    if (candidates->size == 0) {
+        candidates->size = 1;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
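The added guard in llama_sample_token_mirostat_v2 keeps at least one candidate when mu is so small that even the most likely token is "too surprising" and the truncation would otherwise empty the pool. A standalone sketch of the truncation plus guard, with made-up probabilities:

```cpp
// Sketch of the mirostat-v2 truncation with the 0.2.1 guard; probabilities are made up.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> p = {0.6f, 0.3f, 0.1f};  // candidate probabilities, sorted descending
    float mu = 0.2f;                            // target surprise; deliberately tiny

    // truncate at the first candidate whose surprise -log2(p) exceeds mu
    auto it = std::find_if(p.begin(), p.end(),
                           [mu](float prob) { return -std::log2(prob) > mu; });
    size_t size = (size_t) std::distance(p.begin(), it);

    // the 0.2.1 guard: never let the candidate pool shrink to zero
    if (size == 0) {
        size = 1;
    }
    p.resize(size);

    std::printf("kept %zu candidate(s)\n", p.size());
    return 0;
}
```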
@@ -2311,7 +2403,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
+#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2322,6 +2417,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+#endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
@@ -2333,6 +2429,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
+#ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
     int n_feed_forward_w2 = 0;
     for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2346,6 +2443,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
+#endif
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -2371,12 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
-            quantize = false;
-        }
-        quantize = quantize && quantized_type != tensor.type;
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+        quantize &= quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2390,31 +2484,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            // TODO: temporary disabled until Metal / OpenCL support is available
-            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
-            //if (tensor.name == "output.weight") {
-            //    new_type = GGML_TYPE_Q6_K;
-            //}
-            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+#ifdef GGML_USE_K_QUANTS
+            if (tensor.name == "output.weight") {
+                new_type = GGML_TYPE_Q6_K;
+            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
-            }
-            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            }
-            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+#endif
 
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
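The tensor filter above is now written as boolean &= accumulation, equivalent to the old if/else form: only 2D tensors are quantized, output.weight is kept unless params->quantize_output_tensor is set, and tensors already in the target type are skipped. A tiny standalone sketch with made-up inputs:

```cpp
// Sketch of the boolean filter used when deciding whether to quantize a tensor;
// the inputs below are made up.
#include <cstdio>
#include <string>

int main() {
    const bool quantize_output_tensor = false;  // llama_model_quantize_params::quantize_output_tensor
    const std::string name = "output.weight";
    const int  n_dims = 2;
    const bool type_differs = true;             // quantized_type != tensor.type

    bool quantize = true;
    quantize &= (n_dims == 2);                                      // only 2D tensors
    quantize &= quantize_output_tensor || name != "output.weight";  // optionally keep the output layer
    quantize &= type_differs;                                       // skip no-op conversions

    std::printf("quantize %s -> %s\n", name.c_str(), quantize ? "yes" : "no");
    return 0;
}
```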
@@ -2554,8 +2645,8 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                 params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
@@ -2564,7 +2655,7 @@ struct llama_context * llama_init_from_file(
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -3301,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    for (int i = 0; i<n; ++i) {
+        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        scores[i] = ctx->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
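0.2.1 also exports llama_get_vocab, shown above. A hedged usage sketch from C++; the model path is a placeholder, and llama_n_vocab is assumed to be available from the same llama.h.

```cpp
// Usage sketch for the new llama_get_vocab export; the model path is a placeholder.
#include <cstdio>
#include <vector>
#include "llama.h"   // from llama_cpp 0.2.1's vendored llama.cpp

int main() {
    llama_context_params params = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", params); // placeholder path
    if (!ctx) {
        return 1;
    }

    const int n_vocab = llama_n_vocab(ctx);
    std::vector<const char *> tokens(n_vocab);
    std::vector<float>        scores(n_vocab);

    // copies up to `capacity` token strings and scores; returns how many were written
    const int n = llama_get_vocab(ctx, tokens.data(), scores.data(), n_vocab);
    for (int i = 0; i < n && i < 10; ++i) {
        std::printf("%5d  %-12s  %f\n", i, tokens[i], scores[i]);
    }

    llama_free(ctx);
    return 0;
}
```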