llama_cpp 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,11 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_K_QUANTS
+ #ifndef QK_K
+ #define QK_K 256
+ #endif
+ #endif

  #include <array>
  #include <ctime>
@@ -40,6 +45,10 @@
  #include <sstream>
  #include <numeric>

+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -165,6 +174,11 @@ struct llama_kv_cache {
  if (ctx) {
  ggml_free(ctx);
  }
+
+ #ifdef GGML_USE_CUBLAS
+ ggml_cuda_free_data(k);
+ ggml_cuda_free_data(v);
+ #endif // GGML_USE_CUBLAS
  }
  };

@@ -210,6 +224,7 @@ struct llama_model {
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cuda_free_data(tensors_by_name[i].second);
  }
+ ggml_cuda_free_scratch();
  #elif defined(GGML_USE_CLBLAST)
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  ggml_cl_free_data(tensors_by_name[i].second);
@@ -707,6 +722,9 @@ struct llama_model_loader {

  struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
  struct ggml_tensor * tensor;
+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, true);
+ }
  if (lt.ne.size() == 2) {
  tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
  } else {
@@ -716,6 +734,9 @@ struct llama_model_loader {
  ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

+ if (backend != GGML_BACKEND_CPU) {
+ ggml_set_no_alloc(ggml_ctx, use_mmap);
+ }
  tensor->backend = backend;
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -731,6 +752,7 @@ struct llama_model_loader {
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
  size_t prefetch_size = 0;
+ size_t lock_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
  if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +762,6 @@ struct llama_model_loader {

  if (use_mmap) {
  mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
- if (!lmlock) {
- // Don't call the callback since the actual loading will be lazy
- // and we can't measure it.
- progress_callback = NULL;
- }
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -752,20 +769,49 @@ struct llama_model_loader {

  size_t done_size = 0;
  for (llama_load_tensor & lt : tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
- continue;
- }
  if (progress_callback) {
  progress_callback((float) done_size / data_size, progress_callback_user_data);
  }
  LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
  lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+ // allocate temp buffer if not using mmap
+ if (!use_mmap && lt.data == NULL) {
+ GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+ lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+ }
+
  load_data_for(lt);
- lt.ggml_tensor->data = lt.data;
- done_size += lt.size;
- if (use_mmap && lmlock) {
- lmlock->grow_to(done_size);
+
+ switch(lt.ggml_tensor->backend) {
+ case GGML_BACKEND_CPU:
+ lt.ggml_tensor->data = lt.data;
+ if (use_mmap && lmlock) {
+ lock_size += lt.size;
+ lmlock->grow_to(lock_size);
+ }
+ break;
+ #if defined(GGML_USE_CUBLAS)
+ case GGML_BACKEND_GPU:
+ case GGML_BACKEND_GPU_SPLIT:
+ ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+ #elif defined(GGML_USE_CLBLAST)
+ case GGML_BACKEND_GPU:
+ ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+ if (!use_mmap) {
+ free(lt.data);
+ }
+ break;
+ #endif
+ default:
+ continue;
  }
+
+ done_size += lt.size;
  }
  }

@@ -836,7 +882,8 @@ static bool kv_cache_init(
  const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
  ggml_type wtype,
- int n_ctx) {
+ int n_ctx,
+ int n_gpu_layers) {
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;

@@ -844,6 +891,7 @@ static bool kv_cache_init(
  const int64_t n_elements = n_embd*n_mem;

  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ cache.n = 0;

  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -862,25 +910,36 @@ static bool kv_cache_init(
  ggml_set_name(cache.k, "cache_k");
  ggml_set_name(cache.v, "cache_v");

+ (void) n_gpu_layers;
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer + 1) {
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
+ }
+ #endif // GGML_USE_CUBLAS
+
  return true;
  }

  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
+ /*.seed =*/ -1,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ {0},
- /*.seed =*/ -1,
+ /*.progress_callback =*/ nullptr,
+ /*.progress_callback_user_data =*/ nullptr,
+ /*.low_vram =*/ false,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
  /*.embedding =*/ false,
- /*.progress_callback =*/ nullptr,
- /*.progress_callback_user_data =*/ nullptr,
  };

  return result;
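
The hunk above moves `seed` to the front of `llama_context_params` and adds the new `low_vram` flag next to the progress-callback fields. A minimal usage sketch, not part of the diff: it assumes the `llama.h` C API bundled with this crate version (field names such as `n_ctx`, `n_gpu_layers` and `low_vram` as declared there), and the model path is a placeholder.

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_ctx        = 2048;
        params.n_gpu_layers = 32;   // offload the repeating layers to the GPU
        params.low_vram     = true; // new in this release: skip the VRAM scratch buffer and KV cache offload
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        llama_free(ctx);
        return 0;
    }
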
@@ -980,6 +1039,7 @@ static void llama_model_load_internal(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1005,6 +1065,12 @@ static void llama_model_load_internal(
  case 40: model.type = e_model::MODEL_13B; break;
  case 60: model.type = e_model::MODEL_30B; break;
  case 80: model.type = e_model::MODEL_65B; break;
+ default:
+ {
+ if (hparams.n_layer < 32) {
+ model.type = e_model::MODEL_7B;
+ }
+ } break;
  }

  hparams.n_ctx = n_ctx;
@@ -1100,18 +1166,34 @@ static void llama_model_load_internal(
  ml->ggml_ctx = ctx;

  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);

  // "output" tensor
  {
+ ggml_backend backend_norm;
  ggml_backend backend_output;
  if (n_gpu_layers > int(n_layer)) { // NOLINT
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
  } else {
+ backend_norm = GGML_BACKEND_CPU;
  backend_output = GGML_BACKEND_CPU;
  }

+ model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
  model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
  }

  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1141,7 +1223,7 @@ static void llama_model_load_internal(
  if (backend == GGML_BACKEND_GPU) {
  vram_weights +=
  ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
  ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
  }
  }
@@ -1169,23 +1251,49 @@ static void llama_model_load_internal(
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

  (void) vram_scratch;
+ (void) n_batch;
  #ifdef GGML_USE_CUBLAS
- vram_scratch = n_batch * MB;
- ggml_cuda_set_scratch_size(vram_scratch);
- if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
- __func__, vram_scratch / MB);
+ if (low_vram) {
+ fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+ ggml_cuda_set_scratch_size(0); // disable scratch
+ } else {
+ vram_scratch = n_batch * MB;
+ ggml_cuda_set_scratch_size(vram_scratch);
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch / MB);
+ }
  }
  #endif // GGML_USE_CUBLAS
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+ fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
+ fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
+ }
+ size_t vram_kv_cache = 0;
+ if (n_gpu_layers > (int) hparams.n_layer + 1) {
+ if (low_vram) {
+ fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+ } else {
+ fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ }
+ }
+ if (n_gpu_layers > (int) hparams.n_layer + 2) {
+ if (low_vram) {
+ fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+ } else {
+ fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+ }
  }
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
- __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+ __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
  #endif
@@ -1196,58 +1304,15 @@ static void llama_model_load_internal(
  model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
  }

- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
+ (void) tensor_split;
  #if defined(GGML_USE_CUBLAS)
  {
  ggml_cuda_set_tensor_split(tensor_split);
-
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- ggml_backend backend = lt.ggml_tensor->backend;
- if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
- }
- #elif defined(GGML_USE_CLBLAST)
- {
- size_t done_size = 0;
- size_t data_size = 0;
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- done_size += lt.size;
- }
- }
- for (llama_load_tensor & lt : ml->tensors_map.tensors) {
- if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
- continue;
- }
- if (progress_callback) {
- progress_callback((float) done_size / data_size, progress_callback_user_data);
- }
- ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
- done_size += lt.size;
- }
  }
- #else
- (void) n_batch;
- (void) tensor_split;
  #endif

+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
  }
@@ -1267,6 +1332,7 @@ static bool llama_model_load(
  int n_gpu_layers,
  int main_gpu,
  float * tensor_split,
+ bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
@@ -1274,7 +1340,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1350,12 +1416,33 @@ static bool llama_eval_internal(
  const int i_gpu_start = n_layer - n_gpu_layers;
  (void) i_gpu_start;

+ // offload functions set the tensor output backend to GPU
+ // tensors are GPU-accelerated if any input or the output has been offloaded
+ //
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+ // in that case ggml_cuda_assign_buffers has no effect
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+ offload_func_t offload_func_kq = llama_nop;
+ offload_func_t offload_func_v = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers;
+ }
+ #endif // GGML_USE_CUBLAS
+
  for (int il = 0; il < n_layer; ++il) {
  offload_func_t offload_func = llama_nop;

  #ifdef GGML_USE_CUBLAS
  if (il >= i_gpu_start) {
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+ offload_func = ggml_cuda_assign_buffers;
  }
  #endif // GGML_USE_CUBLAS

@@ -1378,31 +1465,42 @@ static bool llama_eval_internal(
  // self-attention
  {
  // compute Q and K and RoPE them
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- // offload_func(tmpq);
- ggml_set_name(tmpq, "tmpq");
-
  struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- // offload_func(tmpk);
+ offload_func_kq(tmpk);
  ggml_set_name(tmpk, "tmpk");

+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ offload_func_kq(tmpq);
+ ggml_set_name(tmpq, "tmpq");
+
  struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

  struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

  // store key and value to memory
  {
  // compute the transposed [N, n_embd] V matrix
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ offload_func_v(tmpv);
+ ggml_set_name(tmpv, "tmpv");
+
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+ offload_func_v(Vcur);
  ggml_set_name(Vcur, "Vcur");

  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+ offload_func_kq(k);
  ggml_set_name(k, "k");
+
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
  ( n_ctx)*ggml_element_size(kv_self.v),
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+ offload_func_v(v);
  ggml_set_name(v, "v");

  // important: storing RoPE-ed version of K in the KV cache!
@@ -1414,6 +1512,7 @@ static bool llama_eval_internal(
  ggml_permute(ctx0,
  Qcur,
  0, 2, 1, 3);
+ offload_func_kq(Q);
  ggml_set_name(Q, "Q");

  struct ggml_tensor * K =
@@ -1422,10 +1521,12 @@ static bool llama_eval_internal(
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
  n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);
+ offload_func_kq(K);
  ggml_set_name(K, "K");

  // K * Q
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ offload_func_kq(KQ);
  ggml_set_name(KQ, "KQ");

  // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1434,14 +1535,17 @@ static bool llama_eval_internal(

  // KQ_scaled shape [n_past + N, N, n_head, 1]
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ offload_func_kq(KQ_scaled);
  ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ offload_func_kq(KQ_masked);
  ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ offload_func_v(KQ_soft_max);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

  // split cached V into n_head heads
@@ -1451,10 +1555,12 @@ static bool llama_eval_internal(
  n_ctx*ggml_element_size(kv_self.v),
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ offload_func_v(V);
  ggml_set_name(V, "V");

  #if 1
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ offload_func_v(KQV);
  ggml_set_name(KQV, "KQV");
  #else
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1466,12 +1572,14 @@ static bool llama_eval_internal(

  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ offload_func_v(KQV_merged);
  ggml_set_name(KQV_merged, "KQV_merged");

  // cur = KQV_merged.contiguous().view(n_embd, N)
  cur = ggml_cpy(ctx0,
  KQV_merged,
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ offload_func_v(cur);
  ggml_set_name(cur, "KQV_merged_contiguous");

  // projection (no bias)
@@ -1483,7 +1591,6 @@ static bool llama_eval_internal(
  }

  lctx.use_buf(ctx0, 1);
- //ggml_cuda_set_scratch(1);

  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
  offload_func(inpFF);
@@ -1513,7 +1620,7 @@ static bool llama_eval_internal(
  model.layers[il].w1,
  cur);
  offload_func(cur);
- ggml_set_name(cur, "result_w2");
+ ggml_set_name(cur, "result_w1");

  // SILU activation
  cur = ggml_silu(ctx0, cur);
@@ -1541,32 +1648,20 @@ static bool llama_eval_internal(
  }

  lctx.use_buf(ctx0, 0);
- //ggml_cuda_set_scratch(0);

  // used at the end to optionally extract the embeddings
  struct ggml_tensor * embeddings = NULL;

- offload_func_t offload_func = llama_nop;
-
- #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
- }
- #endif // GGML_USE_CUBLAS

  // norm
  {
  cur = ggml_rms_norm(ctx0, inpL);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_inpL");
-
- cur = ggml_rms_norm(ctx0, cur);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_after");
+ offload_func_nr(cur);
+ ggml_set_name(cur, "rms_norm_2");

  // cur = cur*norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.norm);
- offload_func(cur);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
  ggml_set_name(cur, "result_norm");

  embeddings = cur;
@@ -2174,6 +2269,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
  return -log2f(candidate.p) > *mu;
  }));

+ if (candidates->size == 0) {
+ candidates->size = 1;
+ }
+
  // Normalize the probabilities of the remaining words
  llama_sample_softmax(ctx, candidates);

@@ -2311,7 +2410,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+ case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+ case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;

+ #ifdef GGML_USE_K_QUANTS
  // K-quants
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2322,6 +2424,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ #endif
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  }

@@ -2333,6 +2436,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  /*vocab_only*/ false));
  llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

+ #ifdef GGML_USE_K_QUANTS
  int n_attention_wv = 0;
  int n_feed_forward_w2 = 0;
  for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2346,6 +2450,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  int i_attention_wv = 0;
  int i_feed_forward_w2 = 0;
+ #endif

  size_t total_size_org = 0;
  size_t total_size_new = 0;
@@ -2371,12 +2476,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);
-
- // uncomment this to keep the output layer in FP16
- if (!params->quantize_output_tensor && tensor.name == "output.weight") {
- quantize = false;
- }
- quantize = quantize && quantized_type != tensor.type;
+ quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+ quantize &= quantized_type != tensor.type;

  enum ggml_type new_type;
  void * new_data;
@@ -2390,31 +2491,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
  } else {
  new_type = quantized_type;
- // TODO: temporary disabled until Metal / OpenCL support is available
- // ref: https://github.com/ggerganov/llama.cpp/issues/1711
- //if (tensor.name == "output.weight") {
- // new_type = GGML_TYPE_Q6_K;
- //}
- if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ #ifdef GGML_USE_K_QUANTS
+ if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+ quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
+ fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+ fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+ fprintf(stderr, "========================================================================================\n\n");
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
+ if (tensor.name == "output.weight") {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K == 0 && ny % QK_K == 0) {
+ new_type = GGML_TYPE_Q6_K;
+ }
+ } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
  (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
- }
- if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
  (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
- }
- if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
+ #endif

  float * f32_data;
  size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
@@ -2554,8 +2667,8 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+ params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
  params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
  fprintf(stderr, "%s: failed to load model\n", __func__);
  llama_free(ctx);
@@ -2564,7 +2677,7 @@ struct llama_context * llama_init_from_file(

  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
@@ -2599,16 +2712,21 @@ struct llama_context * llama_init_from_file(
  // this allocates all Metal resources and memory buffers
  ctx->ctx_metal = ggml_metal_init();

- void *data_ptr = NULL;
+ void * data_ptr = NULL;
  size_t data_size = 0;
+
  if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size= ctx->model.mapping->size;
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
  } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size= ggml_get_mem_size(ctx->model.ctx);
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
  }

+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+ printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
  fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2616,12 +2734,13 @@ struct llama_context * llama_init_from_file(
  return NULL; \
  }

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
  #endif
@@ -3007,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);

- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

@@ -3115,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  const size_t elt_size = ggml_element_size(kv_self.k);

- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;

@@ -3301,6 +3416,19 @@ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

+ int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity) {
+ int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+ for (int i = 0; i<n; ++i) {
+ strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+ scores[i] = ctx->vocab.id_to_token[i].score;
+ }
+ return n;
+ }
+
  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }
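
The new `llama_get_vocab` entry point copies up to `capacity` token strings and scores into caller-provided arrays and returns how many were written. A hedged sketch of calling it; sizing the buffers with `llama_n_vocab` and the already-initialized `ctx` pointer are assumptions, not shown in the diff.

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    // Print the first few vocabulary entries of an initialized context.
    void dump_vocab(const llama_context * ctx) {
        const int n_vocab = llama_n_vocab(ctx);
        std::vector<const char *> strings(n_vocab);
        std::vector<float>        scores(n_vocab);
        const int n = llama_get_vocab(ctx, strings.data(), scores.data(), n_vocab);
        for (int i = 0; i < n && i < 10; ++i) {
            printf("%5d: '%s' (score %.3f)\n", i, strings[i], scores[i]);
        }
    }
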
@@ -3339,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {

  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }

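
The reworked timing lines derive a tokens-per-second figure from the same microsecond counters: for n events taking t microseconds, milliseconds per token is 1e-3 * t / n and tokens per second is 1e6 / t * n (equivalently 1000 divided by the per-token time). A small worked check of that arithmetic with made-up numbers:

    #include <cstdio>
    #include <cstdint>

    int main() {
        const int64_t t_eval_us = 5000000; // 5 s of eval time (hypothetical)
        const int     n_eval    = 250;     // tokens evaluated (hypothetical)
        const double ms_per_token = 1e-3 * t_eval_us / n_eval; // 20.00 ms per token
        const double tok_per_sec  = 1e6 / t_eval_us * n_eval;  // 50.00 tokens per second
        printf("%8.2f ms per token, %8.2f tokens per second\n", ms_per_token, tok_per_sec);
        return 0;
    }
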