llama_cpp 0.2.0 → 0.2.2

@@ -19,6 +19,11 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_K_QUANTS
+ #ifndef QK_K
+ #define QK_K 256
+ #endif
+ #endif

  #include <array>
  #include <ctime>
@@ -40,6 +45,10 @@
  #include <sstream>
  #include <numeric>

+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -165,6 +174,11 @@ struct llama_kv_cache {
  if (ctx) {
  ggml_free(ctx);
  }
+
+ #ifdef GGML_USE_CUBLAS
+ ggml_cuda_free_data(k);
+ ggml_cuda_free_data(v);
+ #endif // GGML_USE_CUBLAS
  }
  };

@@ -210,6 +224,7 @@ struct llama_model {
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cuda_free_data(tensors_by_name[i].second);
  }
+ ggml_cuda_free_scratch();
  #elif defined(GGML_USE_CLBLAST)
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cl_free_data(tensors_by_name[i].second);
@@ -707,6 +722,9 @@ struct llama_model_loader {

  struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
  struct ggml_tensor * tensor;
+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, true);
+ }
  if (lt.ne.size() == 2) {
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
  } else {
@@ -716,6 +734,9 @@ struct llama_model_loader {
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, use_mmap);
+ }
  tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -731,6 +752,7 @@ struct llama_model_loader {
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
  size_t prefetch_size = 0;
+ size_t lock_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
  if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +762,6 @@ struct llama_model_loader {

  if (use_mmap) {
  mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
- if (!lmlock) {
- // Don't call the callback since the actual loading will be lazy
- // and we can't measure it.
- progress_callback = NULL;
- }
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -752,20 +769,49 @@ struct llama_model_loader {

  size_t done_size = 0;
  for (llama_load_tensor & lt : tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
- continue;
- }
  if (progress_callback) {
  progress_callback((float) done_size / data_size, progress_callback_user_data);
  }
  LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
  lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+ // allocate temp buffer if not using mmap
+ if (!use_mmap && lt.data == NULL) {
+ GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+ lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+ }
+
  load_data_for(lt);
- lt.ggml_tensor->data = lt.data;
- done_size += lt.size;
- if (use_mmap && lmlock) {
- lmlock->grow_to(done_size);
+
+ switch(lt.ggml_tensor->backend) {
+ case GGML_BACKEND_CPU:
+ lt.ggml_tensor->data = lt.data;
+ if (use_mmap && lmlock) {
+ lock_size += lt.size;
+ lmlock->grow_to(lock_size);
+ }
+ break;
+ #if defined(GGML_USE_CUBLAS)
+ case GGML_BACKEND_GPU:
+ case GGML_BACKEND_GPU_SPLIT:
+ ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+ #elif defined(GGML_USE_CLBLAST)
+ case GGML_BACKEND_GPU:
+ ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+ #endif
+ default:
+ continue;
  }
+
+ done_size += lt.size;
  }
  }

@@ -836,7 +882,8 @@ static bool kv_cache_init(
  const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
  ggml_type wtype,
- int n_ctx) {
+ int n_ctx,
+ int n_gpu_layers) {
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;

@@ -844,6 +891,7 @@ static bool kv_cache_init(
  const int64_t n_elements = n_embd*n_mem;

  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ cache.n = 0;

  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -862,25 +910,36 @@ static bool kv_cache_init(
  ggml_set_name(cache.k, "cache_k");
  ggml_set_name(cache.v, "cache_v");

+ (void) n_gpu_layers;
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer + 1) {
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
+ }
+ #endif // GGML_USE_CUBLAS
+
  return true;
  }

  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
+ /*.seed =*/ -1,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ {0},
- /*.seed =*/ -1,
+ /*.progress_callback =*/ nullptr,
+ /*.progress_callback_user_data =*/ nullptr,
+ /*.low_vram =*/ false,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
  /*.embedding =*/ false,
- /*.progress_callback =*/ nullptr,
- /*.progress_callback_user_data =*/ nullptr,
  };

  return result;
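
The reordered llama_context_params above puts seed first, moves the progress callback fields earlier, and introduces the low_vram flag. A minimal usage sketch, assuming the field names exposed in llama.h for this version (n_gpu_layers, low_vram) match the initializer comments:

    #include "llama.h"

    // Hypothetical helper, not part of the diff: create a context that offloads a few
    // layers but keeps VRAM use down by skipping the scratch buffer and KV-cache offload.
    static struct llama_context * init_low_vram(const char * model_path) {
        struct llama_context_params params = llama_context_default_params();
        params.n_ctx        = 2048;
        params.n_gpu_layers = 20;    // offload 20 repeating layers
        params.low_vram     = true;  // per the loader changes, disables the VRAM scratch buffer
        return llama_init_from_file(model_path, params);
    }
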
@@ -980,6 +1039,7 @@ static void llama_model_load_internal(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1005,6 +1065,12 @@ static void llama_model_load_internal(
  case 40: model.type = e_model::MODEL_13B; break;
  case 60: model.type = e_model::MODEL_30B; break;
  case 80: model.type = e_model::MODEL_65B; break;
+ default:
+ {
+ if (hparams.n_layer < 32) {
+ model.type = e_model::MODEL_7B;
+ }
+ } break;
  }

  hparams.n_ctx = n_ctx;
@@ -1100,18 +1166,34 @@ static void llama_model_load_internal(
  ml->ggml_ctx = ctx;

  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);

  // "output" tensor
  {
+ ggml_backend backend_norm;
  ggml_backend backend_output;
  if (n_gpu_layers > int(n_layer)) { // NOLINT
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
  } else {
+ backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
  }

+ model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
  model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
  }

  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1141,7 +1223,7 @@ static void llama_model_load_internal(
  if (backend == GGML_BACKEND_GPU) {
  vram_weights +=
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
  }
  }
@@ -1169,23 +1251,49 @@ static void llama_model_load_internal(
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

  (void) vram_scratch;
+ (void) n_batch;
  #ifdef GGML_USE_CUBLAS
- vram_scratch = n_batch * MB;
- ggml_cuda_set_scratch_size(vram_scratch);
- if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
- __func__, vram_scratch / MB);
+ if (low_vram) {
+ fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+ ggml_cuda_set_scratch_size(0); // disable scratch
+ } else {
+ vram_scratch = n_batch * MB;
+ ggml_cuda_set_scratch_size(vram_scratch);
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch / MB);
+ }
  }
  #endif // GGML_USE_CUBLAS
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+ fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
+ fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
+ }
+ size_t vram_kv_cache = 0;
+ if (n_gpu_layers > (int) hparams.n_layer + 1) {
+ if (low_vram) {
+ fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+ } else {
+ fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ }
+ }
+ if (n_gpu_layers > (int) hparams.n_layer + 2) {
+ if (low_vram) {
+ fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+ } else {
+ fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ }
  }
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
- __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+ __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
  #endif
@@ -1196,58 +1304,15 @@ static void llama_model_load_internal(
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
  }

- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
+ (void) tensor_split;
  #if defined(GGML_USE_CUBLAS)
  {
  ggml_cuda_set_tensor_split(tensor_split);
-
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- ggml_backend backend = lt.ggml_tensor->backend;
- if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
- }
- #elif defined(GGML_USE_CLBLAST)
- {
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
  }
- #else
- (void) n_batch;
- (void) tensor_split;
  #endif

+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
  }
@@ -1267,6 +1332,7 @@ static bool llama_model_load(
  int n_gpu_layers,
  int main_gpu,
  float * tensor_split,
+ bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1274,7 +1340,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1350,12 +1416,33 @@ static bool llama_eval_internal(
  const int i_gpu_start = n_layer - n_gpu_layers;
  (void) i_gpu_start;

+ // offload functions set the tensor output backend to GPU
+ // tensors are GPU-accelerated if any input or the output has been offloaded
+ //
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+ // in that case ggml_cuda_assign_buffers has no effect
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+ offload_func_t offload_func_kq = llama_nop;
+ offload_func_t offload_func_v = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
+ #endif // GGML_USE_CUBLAS
+
  for (int il = 0; il < n_layer; ++il) {
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
  if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ offload_func = ggml_cuda_assign_buffers;
  }
  #endif // GGML_USE_CUBLAS

@@ -1378,31 +1465,42 @@ static bool llama_eval_internal(
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- // offload_func(tmpq);
- ggml_set_name(tmpq, "tmpq");
-
  struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- // offload_func(tmpk);
+ offload_func_kq(tmpk);
  ggml_set_name(tmpk, "tmpk");

+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ offload_func_kq(tmpq);
+ ggml_set_name(tmpq, "tmpq");
+
  struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

  struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

  // store key and value to memory
  {
  // compute the transposed [N, n_embd] V matrix
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ offload_func_v(tmpv);
+ ggml_set_name(tmpv, "tmpv");
+
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+ offload_func_v(Vcur);
  ggml_set_name(Vcur, "Vcur");

  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ offload_func_kq(k);
  ggml_set_name(k, "k");
+
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
  ( n_ctx)*ggml_element_size(kv_self.v),
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ offload_func_v(v);
  ggml_set_name(v, "v");

  // important: storing RoPE-ed version of K in the KV cache!
@@ -1414,6 +1512,7 @@ static bool llama_eval_internal(
  ggml_permute(ctx0,
  Qcur,
  0, 2, 1, 3);
+ offload_func_kq(Q);
  ggml_set_name(Q, "Q");

  struct ggml_tensor * K =
@@ -1422,10 +1521,12 @@ static bool llama_eval_internal(
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
  n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);
+ offload_func_kq(K);
  ggml_set_name(K, "K");

  // K * Q
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ offload_func_kq(KQ);
  ggml_set_name(KQ, "KQ");

  // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1434,14 +1535,17 @@ static bool llama_eval_internal(

  // KQ_scaled shape [n_past + N, N, n_head, 1]
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ offload_func_kq(KQ_scaled);
  ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ offload_func_kq(KQ_masked);
  ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ offload_func_v(KQ_soft_max);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

  // split cached V into n_head heads
@@ -1451,10 +1555,12 @@ static bool llama_eval_internal(
  n_ctx*ggml_element_size(kv_self.v),
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ offload_func_v(V);
  ggml_set_name(V, "V");

  #if 1
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ offload_func_v(KQV);
  ggml_set_name(KQV, "KQV");
  #else
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1466,12 +1572,14 @@ static bool llama_eval_internal(

  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ offload_func_v(KQV_merged);
  ggml_set_name(KQV_merged, "KQV_merged");

  // cur = KQV_merged.contiguous().view(n_embd, N)
  cur = ggml_cpy(ctx0,
  KQV_merged,
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ offload_func_v(cur);
  ggml_set_name(cur, "KQV_merged_contiguous");

  // projection (no bias)
@@ -1483,7 +1591,6 @@ static bool llama_eval_internal(
  }

  lctx.use_buf(ctx0, 1);
- //ggml_cuda_set_scratch(1);

  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
  offload_func(inpFF);
@@ -1513,7 +1620,7 @@ static bool llama_eval_internal(
  model.layers[il].w1,
  cur);
  offload_func(cur);
- ggml_set_name(cur, "result_w2");
+ ggml_set_name(cur, "result_w1");

  // SILU activation
  cur = ggml_silu(ctx0, cur);
@@ -1541,32 +1648,20 @@ static bool llama_eval_internal(
  }

  lctx.use_buf(ctx0, 0);
- //ggml_cuda_set_scratch(0);

  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

- offload_func_t offload_func = llama_nop;
-
- #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
- }
- #endif // GGML_USE_CUBLAS

  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_inpL");
-
- cur = ggml_rms_norm(ctx0, cur);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_after");
+ offload_func_nr(cur);
+ ggml_set_name(cur, "rms_norm_2");

  // cur = cur*norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.norm);
- offload_func(cur);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
  ggml_set_name(cur, "result_norm");

  embeddings = cur;
@@ -2174,6 +2269,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
  return -log2f(candidate.p) > *mu;
  }));

+ if (candidates->size == 0) {
+ candidates->size = 1;
+ }
+
  // Normalize the probabilities of the remaining words
  llama_sample_softmax(ctx, candidates);

@@ -2311,7 +2410,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

+ #ifdef GGML_USE_K_QUANTS
  // K-quants
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2322,6 +2424,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ #endif
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  }

@@ -2333,6 +2436,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

+ #ifdef GGML_USE_K_QUANTS
  int n_attention_wv = 0;
  int n_feed_forward_w2 = 0;
  for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2346,6 +2450,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  int i_attention_wv = 0;
  int i_feed_forward_w2 = 0;
+ #endif

  size_t total_size_org = 0;
  size_t total_size_new = 0;
@@ -2371,12 +2476,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);
-
- // uncomment this to keep the output layer in FP16
- if (!params->quantize_output_tensor && tensor.name == "output.weight") {
- quantize = false;
- }
- quantize = quantize && quantized_type != tensor.type;
+ quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+ quantize &= quantized_type != tensor.type;

  enum ggml_type new_type;
  void * new_data;
@@ -2390,31 +2491,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
  } else {
  new_type = quantized_type;
- // TODO: temporary disabled until Metal / OpenCL support is available
- // ref: https://github.com/ggerganov/llama.cpp/issues/1711
- //if (tensor.name == "output.weight") {
- // new_type = GGML_TYPE_Q6_K;
- //}
- if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ #ifdef GGML_USE_K_QUANTS
+ if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+ quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
+ fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+ fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+ fprintf(stderr, "========================================================================================\n\n");
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
+ if (tensor.name == "output.weight") {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K == 0 && ny % QK_K == 0) {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
  (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
- }
- if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
  (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
- }
- if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ #endif

  float * f32_data;
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
@@ -2554,8 +2667,8 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+ params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
  params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
  llama_free(ctx);
@@ -2564,7 +2677,7 @@ struct llama_context * llama_init_from_file(

  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
@@ -2599,16 +2712,21 @@ struct llama_context * llama_init_from_file(
  // this allocates all Metal resources and memory buffers
  ctx->ctx_metal = ggml_metal_init();

- void *data_ptr = NULL;
+ void * data_ptr = NULL;
  size_t data_size = 0;
+
  if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size= ctx->model.mapping->size;
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
  } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size= ggml_get_mem_size(ctx->model.ctx);
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
  }

+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+ printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
  fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2616,12 +2734,13 @@ struct llama_context * llama_init_from_file(
  return NULL; \
  }

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
  #endif
@@ -3007,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);

- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

@@ -3115,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  const size_t elt_size = ggml_element_size(kv_self.k);

- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

@@ -3301,6 +3416,19 @@ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

+ int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity) {
+ int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+ for (int i = 0; i<n; ++i) {
+ strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+ scores[i] = ctx->vocab.id_to_token[i].score;
+ }
+ return n;
+ }
+
  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }
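
llama_get_vocab, added above, copies up to capacity token strings and scores into caller-provided arrays and returns how many entries were written. A hedged usage sketch (the buffer size and the dump_first_tokens helper are illustrative, and the declaration is assumed to be exported through llama.h):

    #include <stdio.h>
    #include "llama.h"

    // Print the first few vocabulary entries of a loaded context.
    static void dump_first_tokens(const struct llama_context * ctx) {
        const char * strings[16];
        float scores[16];
        const int n = llama_get_vocab(ctx, strings, scores, 16);
        for (int i = 0; i < n; ++i) {
            printf("%5d  %-16s  %8.3f\n", i, strings[i], scores[i]);
        }
    }
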
@@ -3339,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }
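
The extended timing lines derive throughput from the same microsecond counters: milliseconds per token is 1e-3 * t_us / n and tokens per second is 1e6 * n / t_us. A small self-contained check of that arithmetic with made-up numbers:

    #include <stdio.h>

    int main(void) {
        const long long t_us = 2500000;  // 2.5 s of accumulated eval time (illustrative)
        const int       n    = 100;      // 100 tokens
        printf("%.2f ms per token\n", 1e-3 * t_us / n);              // 25.00
        printf("%.2f tokens per second\n", 1e6 / (double) t_us * n); // 40.00
        return 0;
    }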