llama_cpp 0.3.5 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
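The most visible change in this bump is a new internal logging layer: direct fprintf(stderr, ...) and printf(...) calls throughout llama.cpp are replaced by LLAMA_LOG_INFO/WARN/ERROR macros that route through llama_log_internal and a globally stored callback (see the diff below). A minimal sketch of how a client might redirect that output, assuming the llama_log_set() setter that llama.h exposes alongside these internals; the callback signature follows llama_log_callback_default in the diff:

    #include <cstdio>
    #include "llama.h"

    // Hypothetical client-side hook: forward only warnings and errors, drop info chatter.
    static void my_log_callback(llama_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == LLAMA_LOG_LEVEL_WARN || level == LLAMA_LOG_LEVEL_ERROR) {
            fputs(text, stderr);
        }
    }

    int main() {
        llama_log_set(my_log_callback, /*user_data=*/nullptr);
        // ... llama_load_model_from_file() etc. now report through my_log_callback.
        return 0;
    }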
@@ -56,8 +56,21 @@
56
56
  #pragma warning(disable: 4244 4267) // possible loss of data
57
57
  #endif
58
58
 
59
+ static void llama_log_internal(llama_log_level level, const char* format, ...);
60
+ static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
61
+ #define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
62
+ #define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
63
+ #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
64
+
65
+
66
+ #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
67
+ #include "ggml-alloc.h"
68
+ #define LLAMA_USE_ALLOCATOR
69
+ #else
59
70
  #define LLAMA_USE_SCRATCH
60
71
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
72
+ #endif
73
+
61
74
 
62
75
  // available llama models
63
76
  enum e_model {
@@ -143,7 +156,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
143
156
  }
144
157
 
145
158
  // amount of VRAM needed per batch size to hold temporary results
146
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
159
+ // the values for 3b are not derived from testing but instead chosen conservatively
147
160
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
148
161
  {
149
162
  static std::map<e_model, size_t> k_sizes = {
@@ -151,14 +164,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
151
164
  { MODEL_7B, 512ull * kB },
152
165
  { MODEL_13B, 640ull * kB },
153
166
  { MODEL_30B, 768ull * kB },
154
- { MODEL_65B, 1536ull * kB },
155
- { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
167
+ { MODEL_65B, 1280ull * kB },
168
+ { MODEL_70B, 1280ull * kB },
156
169
  };
157
170
  return k_sizes;
158
171
  }
159
172
 
160
173
  // amount of VRAM needed per batch size and context to hold temporary results
161
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
174
+ // the values for 3b are not derived from testing but instead chosen conservatively
162
175
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
163
176
  {
164
177
  static std::map<e_model, size_t> k_sizes = {
@@ -166,8 +179,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
166
179
  { MODEL_7B, 128ull },
167
180
  { MODEL_13B, 160ull },
168
181
  { MODEL_30B, 208ull },
169
- { MODEL_65B, 416ull },
170
- { MODEL_70B, 416ull }, // TODO (likely can be reduced)
182
+ { MODEL_65B, 256ull },
183
+ { MODEL_70B, 256ull },
171
184
  };
172
185
  return k_sizes;
173
186
  }
@@ -327,13 +340,22 @@ struct llama_model {
327
340
 
328
341
  struct llama_context {
329
342
  llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
330
- #ifdef GGML_USE_METAL
331
343
  ~llama_context() {
344
+ if (model_owner) {
345
+ delete &model;
346
+ }
347
+ #ifdef GGML_USE_METAL
332
348
  if (ctx_metal) {
333
349
  ggml_metal_free(ctx_metal);
334
350
  }
335
- }
336
351
  #endif
352
+ #ifdef LLAMA_USE_ALLOCATOR
353
+ if (alloc) {
354
+ ggml_allocr_free(alloc);
355
+ }
356
+ #endif
357
+ }
358
+
337
359
  std::mt19937 rng;
338
360
 
339
361
  bool has_evaluated_once = false;
@@ -371,7 +393,17 @@ struct llama_context {
371
393
  // memory buffers used to evaluate the model
372
394
  // TODO: move in llama_state
373
395
  llama_ctx_buffer buf_compute;
396
+
397
+ #ifdef LLAMA_USE_ALLOCATOR
398
+ llama_ctx_buffer buf_alloc;
399
+ ggml_allocr * alloc = NULL;
400
+ #endif
401
+
402
+ #ifdef LLAMA_USE_SCRATCH
374
403
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
404
+ int buf_last = 0;
405
+ size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
406
+ #endif
375
407
 
376
408
  #ifdef GGML_USE_METAL
377
409
  ggml_metal_context * ctx_metal = NULL;
@@ -381,9 +413,6 @@ struct llama_context {
381
413
  ggml_mpi_context * ctx_mpi = NULL;
382
414
  #endif
383
415
 
384
- int buf_last = 0;
385
- size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
386
-
387
416
  void use_buf(struct ggml_context * ctx, int i) {
388
417
  #if defined(LLAMA_USE_SCRATCH)
389
418
  size_t last_size = 0;
@@ -416,6 +445,14 @@ struct llama_context {
416
445
  }
417
446
  };
418
447
 
448
+ struct llama_state {
449
+ // We save the log callback globally
450
+ llama_log_callback log_callback = llama_log_callback_default;
451
+ void * log_callback_user_data = nullptr;
452
+ };
453
+ // global state
454
+ static llama_state g_state;
455
+
419
456
  template <typename T>
420
457
  static T checked_mul(T a, T b) {
421
458
  T ret = a * b;
@@ -482,7 +519,7 @@ struct llama_file_loader {
482
519
 
483
520
  llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
484
521
  : file(fname, "rb") {
485
- fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
522
+ LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname);
486
523
  read_magic();
487
524
  read_hparams();
488
525
  read_vocab();
@@ -597,7 +634,7 @@ struct llama_file_saver {
597
634
  llama_file_loader * any_file_loader;
598
635
  llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
599
636
  : file(fname, "wb"), any_file_loader(any_file_loader) {
600
- fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
637
+ LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname);
601
638
  write_magic();
602
639
  write_hparams(new_ftype);
603
640
  write_vocab();
@@ -618,7 +655,7 @@ struct llama_file_saver {
618
655
  }
619
656
  void write_vocab() {
620
657
  if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
621
- fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
658
+ LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
622
659
  }
623
660
  uint32_t n_vocab = any_file_loader->hparams.n_vocab;
624
661
  for (uint32_t i = 0; i < n_vocab; i++) {
@@ -725,12 +762,12 @@ struct llama_model_loader {
725
762
 
726
763
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
727
764
  size_t data_size = 0;
728
- size_t prefetch_size = 0;
765
+ size_t prefetch_size = file_loader->file.size;
729
766
  size_t lock_size = 0;
730
767
  for (const llama_load_tensor & lt : tensors_map.tensors) {
731
768
  data_size += lt.size;
732
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
733
- prefetch_size += lt.size;
769
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
770
+ prefetch_size -= lt.size;
734
771
  }
735
772
  }
736
773
 
@@ -809,7 +846,7 @@ struct llama_model_loader {
809
846
  uint8_t byte = lt.data[i];
810
847
  sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
811
848
  }
812
- fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
849
+ LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
813
850
  llama_format_tensor_shape(lt.ne).c_str(), lt.size);
814
851
  }
815
852
 
@@ -842,7 +879,7 @@ static bool kv_cache_init(
842
879
  cache.ctx = ggml_init(params);
843
880
 
844
881
  if (!cache.ctx) {
845
- fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
882
+ LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
846
883
  return false;
847
884
  }
848
885
 
@@ -879,6 +916,7 @@ struct llama_context_params llama_context_default_params() {
879
916
  /*.progress_callback =*/ nullptr,
880
917
  /*.progress_callback_user_data =*/ nullptr,
881
918
  /*.low_vram =*/ false,
919
+ /*.mul_mat_q =*/ false,
882
920
  /*.f16_kv =*/ true,
883
921
  /*.logits_all =*/ false,
884
922
  /*.vocab_only =*/ false,
@@ -1006,6 +1044,7 @@ static void llama_model_load_internal(
1006
1044
  int n_gpu_layers,
1007
1045
  int main_gpu,
1008
1046
  const float * tensor_split,
1047
+ const bool mul_mat_q,
1009
1048
  float rope_freq_base,
1010
1049
  float rope_freq_scale,
1011
1050
  bool low_vram,
@@ -1052,7 +1091,7 @@ static void llama_model_load_internal(
1052
1091
  LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
1053
1092
  hparams.n_head_kv = hparams.n_head / n_gqa;
1054
1093
  if (model.type == e_model::MODEL_65B && n_gqa == 8) {
1055
- fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
1094
+ LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
1056
1095
  model.type = e_model::MODEL_70B;
1057
1096
  hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
1058
1097
  }
@@ -1068,22 +1107,22 @@ static void llama_model_load_internal(
1068
1107
  //const uint32_t n_ff = 28672;
1069
1108
 
1070
1109
  {
1071
- fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
1072
- fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1073
- fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1074
- fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
1075
- fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
1076
- fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
1077
- fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1078
- fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
1079
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1080
- fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1081
- fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
1082
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1083
- fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1084
- fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1085
- fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1086
- fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1110
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version));
1111
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1112
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1113
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
1114
+ LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult);
1115
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
1116
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1117
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
1118
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1119
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1120
+ LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
1121
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
1122
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1123
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1124
+ LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1125
+ LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1087
1126
  }
1088
1127
 
1089
1128
  if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1111,7 +1150,7 @@ static void llama_model_load_internal(
1111
1150
  size_t ctx_size;
1112
1151
  size_t mmapped_size;
1113
1152
  ml->calc_sizes(&ctx_size, &mmapped_size);
1114
- fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
1153
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
1115
1154
 
1116
1155
  // create the ggml context
1117
1156
  {
@@ -1134,13 +1173,15 @@ static void llama_model_load_internal(
1134
1173
  }
1135
1174
 
1136
1175
  (void) main_gpu;
1176
+ (void) mul_mat_q;
1137
1177
  #if defined(GGML_USE_CUBLAS)
1138
- fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1178
+ LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
1139
1179
  ggml_cuda_set_main_device(main_gpu);
1180
+ ggml_cuda_set_mul_mat_q(mul_mat_q);
1140
1181
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1141
1182
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1142
1183
  #elif defined(GGML_USE_CLBLAST)
1143
- fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
1184
+ LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
1144
1185
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1145
1186
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
1146
1187
  #else
@@ -1230,25 +1271,29 @@ static void llama_model_load_internal(
1230
1271
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
1231
1272
 
1232
1273
  // this is the total memory required to run the inference
1233
- const size_t mem_required =
1274
+ size_t mem_required =
1234
1275
  ctx_size +
1235
- mmapped_size - vram_weights + // weights in VRAM not in memory
1276
+ mmapped_size - vram_weights; // weights in VRAM not in memory
1277
+
1278
+ #ifndef LLAMA_USE_ALLOCATOR
1279
+ mem_required +=
1236
1280
  MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
1237
1281
  MEM_REQ_SCRATCH1().at(model.type) +
1238
1282
  MEM_REQ_EVAL().at(model.type);
1283
+ #endif
1239
1284
 
1240
1285
  // this is the memory required by one llama_state
1241
1286
  const size_t mem_required_state =
1242
1287
  scale*hparams.kv_size();
1243
1288
 
1244
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1289
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1245
1290
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
1246
1291
 
1247
1292
  (void) vram_scratch;
1248
1293
  (void) n_batch;
1249
1294
  #ifdef GGML_USE_CUBLAS
1250
1295
  if (low_vram) {
1251
- fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1296
+ LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1252
1297
  ggml_cuda_set_scratch_size(0); // disable scratch
1253
1298
  } else {
1254
1299
  const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
@@ -1256,7 +1301,7 @@ static void llama_model_load_internal(
1256
1301
  vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
1257
1302
  ggml_cuda_set_scratch_size(vram_scratch);
1258
1303
  if (n_gpu_layers > 0) {
1259
- fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
1304
+ LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
1260
1305
  __func__, vram_scratch_base / kB, vram_scratch_per_context,
1261
1306
  (vram_scratch + MB - 1) / MB); // round up
1262
1307
  }
@@ -1266,9 +1311,9 @@ static void llama_model_load_internal(
1266
1311
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1267
1312
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1268
1313
 
1269
- fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
1314
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
1270
1315
  if (n_gpu_layers > (int) hparams.n_layer) {
1271
- fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
1316
+ LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
1272
1317
  }
1273
1318
  size_t vram_kv_cache = 0;
1274
1319
 
@@ -1277,17 +1322,17 @@ static void llama_model_load_internal(
1277
1322
  const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
1278
1323
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
1279
1324
  if (low_vram) {
1280
- fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1325
+ LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1281
1326
  } else {
1282
- fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1327
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1283
1328
  vram_kv_cache += hparams.kv_size() / 2;
1284
1329
  }
1285
1330
  }
1286
1331
  if (n_gpu_layers > (int) hparams.n_layer + 2) {
1287
1332
  if (low_vram) {
1288
- fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1333
+ LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1289
1334
  } else {
1290
- fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1335
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1291
1336
  vram_kv_cache += hparams.kv_size() / 2;
1292
1337
  }
1293
1338
  }
@@ -1296,9 +1341,9 @@ static void llama_model_load_internal(
1296
1341
  const int max_offloadable_layers = hparams.n_layer + 1;
1297
1342
  #endif // GGML_USE_CUBLAS
1298
1343
 
1299
- fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
1344
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
1300
1345
  __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
1301
- fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1346
+ LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
1302
1347
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
1303
1348
  #else
1304
1349
  (void) n_gpu_layers;
@@ -1341,6 +1386,7 @@ static bool llama_model_load(
1341
1386
  int n_gpu_layers,
1342
1387
  int main_gpu,
1343
1388
  const float * tensor_split,
1389
+ const bool mul_mat_q,
1344
1390
  float rope_freq_base,
1345
1391
  float rope_freq_scale,
1346
1392
  bool low_vram,
@@ -1351,41 +1397,25 @@ static bool llama_model_load(
1351
1397
  llama_progress_callback progress_callback,
1352
1398
  void *progress_callback_user_data) {
1353
1399
  try {
1354
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1400
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
1401
+ main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1355
1402
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1356
1403
  return true;
1357
1404
  } catch (const std::exception & err) {
1358
- fprintf(stderr, "error loading model: %s\n", err.what());
1405
+ LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
1359
1406
  return false;
1360
1407
  }
1361
1408
  }
1362
1409
 
1363
- // evaluate the transformer
1364
- //
1365
- // - lctx: llama context
1366
- // - tokens: new batch of tokens to process
1367
- // - embd embeddings input
1368
- // - n_tokens number of tokens
1369
- // - n_past: the context size so far
1370
- // - n_threads: number of threads to use
1371
- //
1372
- static bool llama_eval_internal(
1410
+ static struct ggml_cgraph * llama_build_graph(
1373
1411
  llama_context & lctx,
1374
1412
  const llama_token * tokens,
1375
1413
  const float * embd,
1376
1414
  int n_tokens,
1377
- int n_past,
1378
- int n_threads,
1379
- const char * cgraph_fname) {
1415
+ int n_past) {
1380
1416
 
1381
1417
  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1382
1418
 
1383
- #ifdef GGML_USE_MPI
1384
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1385
- #endif
1386
-
1387
- const int64_t t_start_us = ggml_time_us();
1388
-
1389
1419
  const int N = n_tokens;
1390
1420
 
1391
1421
  const auto & model = lctx.model;
@@ -1401,10 +1431,8 @@ static bool llama_eval_internal(
1401
1431
  const int64_t n_head = hparams.n_head;
1402
1432
  const int64_t n_head_kv = hparams.n_head_kv;
1403
1433
  const int64_t n_embd_head = hparams.n_embd_head();
1404
- const int64_t n_vocab = hparams.n_vocab;
1405
1434
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
1406
1435
 
1407
-
1408
1436
  LLAMA_ASSERT(n_embd_head == hparams.n_rot);
1409
1437
 
1410
1438
  const float freq_base = hparams.rope_freq_base;
@@ -1416,26 +1444,35 @@ static bool llama_eval_internal(
1416
1444
  auto & mem_per_token = lctx.mem_per_token;
1417
1445
  auto & buf_compute = lctx.buf_compute;
1418
1446
 
1447
+
1419
1448
  struct ggml_init_params params = {
1420
1449
  /*.mem_size =*/ buf_compute.size,
1421
1450
  /*.mem_buffer =*/ buf_compute.addr,
1422
1451
  /*.no_alloc =*/ false,
1423
1452
  };
1424
1453
 
1454
+ #ifdef LLAMA_USE_ALLOCATOR
1455
+ params.no_alloc = true;
1456
+ #endif
1457
+
1425
1458
  struct ggml_context * ctx0 = ggml_init(params);
1426
1459
 
1427
1460
  ggml_cgraph * gf = ggml_new_graph(ctx0);
1428
1461
 
1429
- // for big prompts, if BLAS is enabled, it is better to use only one thread
1430
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1431
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1432
-
1433
1462
  struct ggml_tensor * cur;
1434
1463
  struct ggml_tensor * inpL;
1435
1464
 
1436
1465
  if (tokens) {
1437
1466
  struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1467
+
1468
+ #ifdef LLAMA_USE_ALLOCATOR
1469
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
1470
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1471
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1472
+ }
1473
+ #else
1438
1474
  memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1475
+ #endif
1439
1476
  ggml_set_name(inp_tokens, "inp_tokens");
1440
1477
 
1441
1478
  inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1445,7 +1482,15 @@ static bool llama_eval_internal(
1445
1482
  #endif
1446
1483
 
1447
1484
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1485
+
1486
+ #ifdef LLAMA_USE_ALLOCATOR
1487
+ ggml_allocr_alloc(lctx.alloc, inpL);
1488
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1489
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1490
+ }
1491
+ #else
1448
1492
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1493
+ #endif
1449
1494
  }
1450
1495
 
1451
1496
  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1472,6 +1517,17 @@ static bool llama_eval_internal(
1472
1517
  }
1473
1518
  #endif // GGML_USE_CUBLAS
1474
1519
 
1520
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
1521
+ #ifdef LLAMA_USE_ALLOCATOR
1522
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
1523
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1524
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1525
+ }
1526
+ #else
1527
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1528
+ #endif
1529
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
1530
+
1475
1531
  for (int il = 0; il < n_layer; ++il) {
1476
1532
  ggml_format_name(inpL, "layer_inp_%d", il);
1477
1533
 
@@ -1567,9 +1623,6 @@ static bool llama_eval_internal(
1567
1623
  ggml_set_name(KQ, "KQ");
1568
1624
 
1569
1625
  // KQ_scaled = KQ / sqrt(n_embd_head)
1570
- struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1571
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
1572
-
1573
1626
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1574
1627
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1575
1628
  offload_func_kq(KQ_scaled);
@@ -1685,9 +1738,6 @@ static bool llama_eval_internal(
1685
1738
 
1686
1739
  lctx.use_buf(ctx0, 0);
1687
1740
 
1688
- // used at the end to optionally extract the embeddings
1689
- struct ggml_tensor * embeddings = NULL;
1690
-
1691
1741
  // norm
1692
1742
  {
1693
1743
  cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1698,8 +1748,6 @@ static bool llama_eval_internal(
1698
1748
  cur = ggml_mul(ctx0, cur, model.norm);
1699
1749
  // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
1700
1750
  ggml_set_name(cur, "result_norm");
1701
-
1702
- embeddings = cur;
1703
1751
  }
1704
1752
 
1705
1753
  // lm_head
@@ -1711,12 +1759,88 @@ static bool llama_eval_internal(
1711
1759
  // logits -> probs
1712
1760
  //cur = ggml_soft_max_inplace(ctx0, cur);
1713
1761
 
1714
- // run the computation
1715
1762
  ggml_build_forward_expand(gf, cur);
1716
1763
 
1717
- // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
1764
+ if (mem_per_token == 0) {
1765
+ mem_per_token = ggml_used_mem(ctx0)/N;
1766
+ }
1767
+
1768
+ #if 0
1769
+ LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1770
+ ggml_used_mem(ctx0)/1024.0/1024.0,
1771
+ lctx.get_buf_max_mem(0)/1024.0/1024.0,
1772
+ lctx.get_buf_max_mem(1)/1024.0/1024.0,
1773
+ lctx.work_buffer.size()/1024.0/1024.0,
1774
+ n_past, N);
1775
+ #endif
1776
+
1777
+ ggml_free(ctx0);
1778
+
1779
+ return gf;
1780
+ }
1781
+
1782
+ // evaluate the transformer
1783
+ //
1784
+ // - lctx: llama context
1785
+ // - tokens: new batch of tokens to process
1786
+ // - embd embeddings input
1787
+ // - n_tokens number of tokens
1788
+ // - n_past: the context size so far
1789
+ // - n_threads: number of threads to use
1790
+ //
1791
+ static bool llama_eval_internal(
1792
+ llama_context & lctx,
1793
+ const llama_token * tokens,
1794
+ const float * embd,
1795
+ int n_tokens,
1796
+ int n_past,
1797
+ int n_threads,
1798
+ const char * cgraph_fname) {
1799
+
1800
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1801
+
1802
+ const int64_t t_start_us = ggml_time_us();
1803
+
1804
+ #ifdef GGML_USE_MPI
1805
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1806
+ #endif
1807
+
1808
+ const int N = n_tokens;
1809
+
1810
+ const auto & model = lctx.model;
1811
+ const auto & hparams = model.hparams;
1812
+
1813
+ const auto & kv_self = lctx.kv_self;
1814
+
1815
+ LLAMA_ASSERT(!!kv_self.ctx);
1816
+
1817
+ const int64_t n_embd = hparams.n_embd;
1818
+ const int64_t n_vocab = hparams.n_vocab;
1819
+
1820
+ #ifdef LLAMA_USE_ALLOCATOR
1821
+ ggml_allocr_reset(lctx.alloc);
1822
+ #endif
1823
+
1824
+ ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
1825
+
1826
+ #ifdef LLAMA_USE_ALLOCATOR
1827
+ ggml_allocr_alloc_graph(lctx.alloc, gf);
1828
+ #endif
1829
+
1830
+ // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
1831
+
1832
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
1833
+ // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1834
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1835
+
1836
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
1837
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
1838
+
1839
+ LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
1840
+ LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
1718
1841
 
1719
1842
  #if GGML_USE_MPI
1843
+ const int64_t n_layer = hparams.n_layer;
1720
1844
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
1721
1845
  #endif
1722
1846
 
@@ -1728,7 +1852,10 @@ static bool llama_eval_internal(
1728
1852
  //}
1729
1853
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1730
1854
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
1731
- ggml_metal_get_tensor (lctx.ctx_metal, cur);
1855
+ ggml_metal_get_tensor (lctx.ctx_metal, res);
1856
+ if (!lctx.embedding.empty()) {
1857
+ ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
1858
+ }
1732
1859
  } else {
1733
1860
  // IMPORTANT:
1734
1861
  // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1759,8 +1886,6 @@ static bool llama_eval_internal(
1759
1886
  // update kv token count
1760
1887
  lctx.kv_self.n = n_past + N;
1761
1888
 
1762
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
1763
-
1764
1889
  if (cgraph_fname) {
1765
1890
  ggml_graph_export(gf, cgraph_fname);
1766
1891
  }
@@ -1798,21 +1923,6 @@ static bool llama_eval_internal(
1798
1923
  memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
1799
1924
  }
1800
1925
 
1801
- if (mem_per_token == 0) {
1802
- mem_per_token = ggml_used_mem(ctx0)/N;
1803
- }
1804
-
1805
- #if 0
1806
- printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1807
- ggml_used_mem(ctx0)/1024.0/1024.0,
1808
- lctx.get_buf_max_mem(0)/1024.0/1024.0,
1809
- lctx.get_buf_max_mem(1)/1024.0/1024.0,
1810
- lctx.work_buffer.size()/1024.0/1024.0,
1811
- n_past, N);
1812
- #endif
1813
-
1814
- ggml_free(ctx0);
1815
-
1816
1926
  // measure the performance only for the single-token evals
1817
1927
  if (N == 1) {
1818
1928
  lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1904,7 +2014,7 @@ struct llama_tokenizer {
1904
2014
  left_sym.n += right_sym.n;
1905
2015
  right_sym.n = 0;
1906
2016
 
1907
- //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
2017
+ //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
1908
2018
 
1909
2019
  // remove the right sym from the chain
1910
2020
  left_sym.next = right_sym.next;
@@ -1924,7 +2034,9 @@ struct llama_tokenizer {
1924
2034
  if (token == vocab_.token_to_id.end()) {
1925
2035
  // output any symbols that did not form tokens as bytes.
1926
2036
  for (int j = 0; j < (int) symbol.n; ++j) {
1927
- llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
2037
+ // NOTE: old version, before #2420 - not sure what are the implications of this
2038
+ //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
2039
+ llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
1928
2040
  output.push_back(token_id);
1929
2041
  }
1930
2042
  } else {
@@ -2910,7 +3022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2910
3022
  tensor.data = read_data.addr;
2911
3023
  model_loader->load_data_for(tensor);
2912
3024
 
2913
- printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
3025
+ LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
2914
3026
  ++idx, model_loader->tensors_map.tensors.size(),
2915
3027
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
2916
3028
  ggml_type_name(tensor.type));
@@ -2932,7 +3044,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2932
3044
  new_type = tensor.type;
2933
3045
  new_data = tensor.data;
2934
3046
  new_size = tensor.size;
2935
- printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
3047
+ LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
2936
3048
  } else {
2937
3049
  new_type = quantized_type;
2938
3050
  #ifdef GGML_USE_K_QUANTS
@@ -2967,17 +3079,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2967
3079
  int nx = tensor.ne.at(0);
2968
3080
  int ny = tensor.ne.at(1);
2969
3081
  if (nx % QK_K != 0 || ny % QK_K != 0) {
2970
- fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
3082
+ LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
2971
3083
  convert_incompatible_tensor = true;
2972
3084
  }
2973
3085
  }
2974
3086
  if (convert_incompatible_tensor) {
2975
3087
  if (tensor.name == "output.weight") {
2976
3088
  new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
2977
- fprintf(stderr, "F16 will be used for this tensor instead.\n");
3089
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
2978
3090
  } else if (tensor.name == "tok_embeddings.weight") {
2979
3091
  new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
2980
- fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
3092
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
2981
3093
  } else {
2982
3094
  throw std::runtime_error("Unsupported tensor size encountered\n");
2983
3095
  }
@@ -2997,7 +3109,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2997
3109
  f32_data = (float *) f32_conv_buf.addr;
2998
3110
  }
2999
3111
 
3000
- printf("quantizing to %s .. ", ggml_type_name(new_type));
3112
+ LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
3001
3113
  fflush(stdout);
3002
3114
 
3003
3115
  work.resize(nelements * 4); // upper bound on size
@@ -3047,7 +3159,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3047
3159
  }
3048
3160
  }
3049
3161
 
3050
- printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
3162
+ LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
3051
3163
  int64_t tot_count = 0;
3052
3164
  for (size_t i = 0; i < hist_cur.size(); i++) {
3053
3165
  hist_all[i] += hist_cur[i];
@@ -3056,18 +3168,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3056
3168
 
3057
3169
  if (tot_count > 0) {
3058
3170
  for (size_t i = 0; i < hist_cur.size(); i++) {
3059
- printf("%5.3f ", hist_cur[i] / float(nelements));
3171
+ LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
3060
3172
  }
3061
3173
  }
3062
- printf("\n");
3174
+ LLAMA_LOG_INFO("\n");
3063
3175
  }
3064
3176
  total_size_org += tensor.size;
3065
3177
  total_size_new += new_size;
3066
3178
  file_saver.write_tensor(tensor, new_type, new_data, new_size);
3067
3179
  }
3068
3180
 
3069
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
3070
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
3181
+ LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
3182
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
3071
3183
 
3072
3184
  {
3073
3185
  int64_t sum_all = 0;
@@ -3076,11 +3188,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3076
3188
  }
3077
3189
 
3078
3190
  if (sum_all > 0) {
3079
- printf("%s: hist: ", __func__);
3191
+ LLAMA_LOG_INFO("%s: hist: ", __func__);
3080
3192
  for (size_t i = 0; i < hist_all.size(); i++) {
3081
- printf("%5.3f ", hist_all[i] / float(sum_all));
3193
+ LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
3082
3194
  }
3083
- printf("\n");
3195
+ LLAMA_LOG_INFO("\n");
3084
3196
  }
3085
3197
  }
3086
3198
  }
@@ -3101,11 +3213,11 @@ struct llama_model * llama_load_model_from_file(
3101
3213
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
3102
3214
 
3103
3215
  if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
3104
- params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3216
+ params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3105
3217
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
3106
3218
  params.progress_callback_user_data)) {
3219
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
3107
3220
  delete model;
3108
- fprintf(stderr, "%s: failed to load model\n", __func__);
3109
3221
  return nullptr;
3110
3222
  }
3111
3223
 
@@ -3138,10 +3250,9 @@ struct llama_context * llama_new_context_with_model(
3138
3250
  unsigned percentage = (unsigned) (100 * progress);
3139
3251
  while (percentage > *cur_percentage_p) {
3140
3252
  *cur_percentage_p = percentage;
3141
- fprintf(stderr, ".");
3142
- fflush(stderr);
3253
+ LLAMA_LOG_INFO(".");
3143
3254
  if (percentage >= 100) {
3144
- fprintf(stderr, "\n");
3255
+ LLAMA_LOG_INFO("\n");
3145
3256
  }
3146
3257
  }
3147
3258
  };
@@ -3155,14 +3266,14 @@ struct llama_context * llama_new_context_with_model(
3155
3266
  // reserve memory for context buffers
3156
3267
  if (!params.vocab_only) {
3157
3268
  if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
3158
- fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
3269
+ LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
3159
3270
  llama_free(ctx);
3160
3271
  return nullptr;
3161
3272
  }
3162
3273
 
3163
3274
  {
3164
3275
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
3165
- fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
3276
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
3166
3277
  }
3167
3278
 
3168
3279
  const auto & hparams = ctx->model.hparams;
@@ -3178,10 +3289,47 @@ struct llama_context * llama_new_context_with_model(
3178
3289
  ctx->embedding.resize(hparams.n_embd);
3179
3290
  }
3180
3291
 
3292
+ #ifdef LLAMA_USE_ALLOCATOR
3293
+ {
3294
+ static const size_t tensor_alignment = 32;
3295
+ // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
3296
+ ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
3297
+
3298
+ // create measure allocator
3299
+ ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
3300
+
3301
+ // build worst-case graph
3302
+ int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
3303
+ int n_past = hparams.n_ctx - n_tokens;
3304
+ llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
3305
+ ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
3306
+
3307
+ // measure memory requirements for the graph
3308
+ size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
3309
+
3310
+ LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
3311
+
3312
+ // debug - for comparison with scratch buffer
3313
+ //size_t prev_req =
3314
+ // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
3315
+ // MEM_REQ_SCRATCH1().at(ctx->model.type) +
3316
+ // MEM_REQ_EVAL().at(ctx->model.type);
3317
+ //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
3318
+
3319
+ // recreate allocator with exact memory requirements
3320
+ ggml_allocr_free(ctx->alloc);
3321
+
3322
+ ctx->buf_alloc.resize(alloc_size);
3323
+ ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
3324
+ }
3325
+ #else
3181
3326
  ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
3327
+ #endif
3182
3328
 
3329
+ #ifdef LLAMA_USE_SCRATCH
3183
3330
  ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
3184
3331
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
3332
+ #endif
3185
3333
  }
3186
3334
 
3187
3335
  #ifdef GGML_USE_METAL
@@ -3202,13 +3350,13 @@ struct llama_context * llama_new_context_with_model(
3202
3350
 
3203
3351
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
3204
3352
 
3205
- fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
3353
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
3206
3354
 
3207
- #define LLAMA_METAL_CHECK_BUF(result) \
3208
- if (!(result)) { \
3209
- fprintf(stderr, "%s: failed to add buffer\n", __func__); \
3210
- llama_free(ctx); \
3211
- return NULL; \
3355
+ #define LLAMA_METAL_CHECK_BUF(result) \
3356
+ if (!(result)) { \
3357
+ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
3358
+ llama_free(ctx); \
3359
+ return NULL; \
3212
3360
  }
3213
3361
 
3214
3362
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
@@ -3251,9 +3399,6 @@ struct llama_context * llama_init_from_file(
3251
3399
  }
3252
3400
 
3253
3401
  void llama_free(struct llama_context * ctx) {
3254
- if (ctx->model_owner) {
3255
- delete &ctx->model;
3256
- }
3257
3402
  delete ctx;
3258
3403
  }
3259
3404
 
@@ -3265,19 +3410,19 @@ int llama_model_quantize(
3265
3410
  llama_model_quantize_internal(fname_inp, fname_out, params);
3266
3411
  return 0;
3267
3412
  } catch (const std::exception & err) {
3268
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
3413
+ LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
3269
3414
  return 1;
3270
3415
  }
3271
3416
  }
3272
3417
 
3273
3418
  int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
3274
- fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
3419
+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
3275
3420
 
3276
3421
  const int64_t t_start_lora_us = ggml_time_us();
3277
3422
 
3278
3423
  auto fin = std::ifstream(path_lora, std::ios::binary);
3279
3424
  if (!fin) {
3280
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
3425
+ LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
3281
3426
  return 1;
3282
3427
  }
3283
3428
 
@@ -3286,14 +3431,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3286
3431
  uint32_t magic;
3287
3432
  fin.read((char *) &magic, sizeof(magic));
3288
3433
  if (magic != LLAMA_FILE_MAGIC_GGLA) {
3289
- fprintf(stderr, "%s: bad file magic\n", __func__);
3434
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
3290
3435
  return 1;
3291
3436
  }
3292
3437
  uint32_t format_version;
3293
3438
  fin.read((char *) &format_version, sizeof(format_version));
3294
3439
 
3295
3440
  if (format_version != 1) {
3296
- fprintf(stderr, "%s: unsupported file version\n", __func__ );
3441
+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
3297
3442
  return 1;
3298
3443
  }
3299
3444
  }
@@ -3304,7 +3449,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3304
3449
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
3305
3450
  float scaling = (float)lora_alpha / (float)lora_r;
3306
3451
 
3307
- fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
3452
+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
3308
3453
 
3309
3454
 
3310
3455
  // create a temporary ggml context to store the lora tensors
@@ -3330,7 +3475,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3330
3475
  ggml_context * base_ctx = NULL;
3331
3476
  llama_buffer base_buf;
3332
3477
  if (path_base_model) {
3333
- fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
3478
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
3334
3479
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
3335
3480
 
3336
3481
  size_t ctx_size;
@@ -3387,17 +3532,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3387
3532
  const std::string lora_suffix = ".lora";
3388
3533
  size_t pos = name.rfind(lora_suffix);
3389
3534
  if (pos == std::string::npos) {
3390
- fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
3535
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
3391
3536
  return 1;
3392
3537
  }
3393
3538
 
3394
3539
  std::string lora_type = name.substr(pos + lora_suffix.length());
3395
3540
  std::string base_name = name;
3396
3541
  base_name.erase(pos);
3397
- // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
3542
+ // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
3398
3543
 
3399
3544
  if (model_tensors.find(base_name) == model_tensors.end()) {
3400
- fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
3545
+ LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
3401
3546
  return 1;
3402
3547
  }
3403
3548
 
@@ -3408,7 +3553,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3408
3553
  case 1: wtype = GGML_TYPE_F16; break;
3409
3554
  default:
3410
3555
  {
3411
- fprintf(stderr, "%s: invalid tensor data type '%d'\n",
3556
+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
3412
3557
  __func__, ftype);
3413
3558
  return false;
3414
3559
  }
@@ -3418,7 +3563,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3418
3563
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
3419
3564
  }
3420
3565
  else {
3421
- fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
3566
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
3422
3567
  return 1;
3423
3568
  }
3424
3569
  ggml_set_name(lora_tensor, "lora_tensor");
@@ -3456,7 +3601,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3456
3601
  if (model_loader) {
3457
3602
  // load from base model
3458
3603
  if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
3459
- fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
3604
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
3460
3605
  return 1;
3461
3606
  }
3462
3607
  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
@@ -3472,8 +3617,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3472
3617
 
3473
3618
  if (ggml_is_quantized(base_t->type)) {
3474
3619
  if (!warned) {
3475
- fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
3476
- "use a f16 or f32 base model with --lora-base\n", __func__);
3620
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
3621
+ "use a f16 or f32 base model with --lora-base\n", __func__);
3477
3622
  warned = true;
3478
3623
  }
3479
3624
  }
@@ -3487,8 +3632,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3487
3632
  ggml_set_name(loraB, "loraB");
3488
3633
 
3489
3634
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
3490
- fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
3491
- " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
3635
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
3636
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
3492
3637
  return 1;
3493
3638
  }
3494
3639
 
@@ -3533,7 +3678,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3533
3678
 
3534
3679
  n_tensors++;
3535
3680
  if (n_tensors % 4 == 0) {
3536
- fprintf(stderr, ".");
3681
+ LLAMA_LOG_INFO(".");
3537
3682
  }
3538
3683
  }
3539
3684
  }
@@ -3545,7 +3690,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3545
3690
  }
3546
3691
 
3547
3692
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
3548
- fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
3693
+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
3549
3694
 
3550
3695
  return 0;
3551
3696
  }
@@ -3554,7 +3699,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
3554
3699
  try {
3555
3700
  return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
3556
3701
  } catch (const std::exception & err) {
3557
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
3702
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
3558
3703
  return 1;
3559
3704
  }
3560
3705
  }
@@ -3563,7 +3708,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
3563
3708
  try {
3564
3709
  return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
3565
3710
  } catch (const std::exception & err) {
3566
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
3711
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
3567
3712
  return 1;
3568
3713
  }
3569
3714
  }
@@ -3612,10 +3757,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
3612
3757
  return s_total;
3613
3758
  }
3614
3759
 
3615
- // Copies the state to the specified destination address
3616
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3617
- uint8_t * out = dst;
3618
-
3760
+ /** copy state data into either a buffer or file depending on the passed in context
3761
+ *
3762
+ * file context:
3763
+ * llama_file file("/path", "wb");
3764
+ * llama_data_file_context data_ctx(&file);
3765
+ * llama_copy_state_data(ctx, &data_ctx);
3766
+ *
3767
+ * buffer context:
3768
+ * std::vector<uint8_t> buf(max_size, 0);
3769
+ * llama_data_buffer_context data_ctx(&buf.data());
3770
+ * llama_copy_state_data(ctx, &data_ctx);
3771
+ *
3772
+ */
3773
+ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
3619
3774
  // copy rng
3620
3775
  {
3621
3776
  std::stringstream rng_ss;
@@ -3627,8 +3782,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3627
3782
  memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
3628
3783
  memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
3629
3784
 
3630
- memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
3631
- memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
3785
+ data_ctx->write(&rng_size, sizeof(rng_size));
3786
+ data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
3632
3787
  }
3633
3788
 
3634
3789
  // copy logits
@@ -3636,25 +3791,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3636
3791
  const size_t logits_cap = ctx->logits.capacity();
3637
3792
  const size_t logits_size = ctx->logits.size();
3638
3793
 
3639
- memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
3640
- memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
3794
+ data_ctx->write(&logits_cap, sizeof(logits_cap));
3795
+ data_ctx->write(&logits_size, sizeof(logits_size));
3641
3796
 
3642
3797
  if (logits_size) {
3643
- memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
3798
+ data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
3644
3799
  }
3645
3800
 
3646
- out += logits_cap * sizeof(float);
3801
+ // If there is a gap between the size and the capacity, write padding
3802
+ size_t padding_size = (logits_cap - logits_size) * sizeof(float);
3803
+ if (padding_size > 0) {
3804
+ std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
3805
+ data_ctx->write(padding.data(), padding_size);
3806
+ }
3647
3807
  }
3648
3808
 
3649
3809
  // copy embeddings
3650
3810
  {
3651
3811
  const size_t embedding_size = ctx->embedding.size();
3652
3812
 
3653
- memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
3813
+ data_ctx->write(&embedding_size, sizeof(embedding_size));
3654
3814
 
3655
3815
  if (embedding_size) {
3656
- memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
3657
- out += embedding_size * sizeof(float);
3816
+ data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
3658
3817
  }
3659
3818
  }
3660
3819
 
@@ -3663,14 +3822,14 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3663
3822
  const auto & kv_self = ctx->kv_self;
3664
3823
  const auto & hparams = ctx->model.hparams;
3665
3824
  const int n_layer = hparams.n_layer;
3666
- const int n_embd = hparams.n_embd;
3825
+ const int n_embd = hparams.n_embd_gqa();
3667
3826
  const int n_ctx = hparams.n_ctx;
3668
3827
 
3669
3828
  const size_t kv_size = kv_self.buf.size;
3670
3829
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);
3671
3830
 
3672
- memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
3673
- memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
3831
+ data_ctx->write(&kv_size, sizeof(kv_size));
3832
+ data_ctx->write(&kv_ntok, sizeof(kv_ntok));
3674
3833
 
3675
3834
  if (kv_size) {
3676
3835
  const size_t elt_size = ggml_element_size(kv_self.k);
@@ -3679,12 +3838,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3679
3838
  ggml_cgraph gf{};
3680
3839
 
3681
3840
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3682
- kout3d->data = out;
3683
- out += ggml_nbytes(kout3d);
3841
+ std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
3842
+ kout3d->data = kout3d_data.data();
3684
3843
 
3685
3844
  ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
3686
- vout3d->data = out;
3687
- out += ggml_nbytes(vout3d);
3845
+ std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
3846
+ vout3d->data = vout3d_data.data();
3688
3847
 
3689
3848
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
3690
3849
  n_embd, kv_ntok, n_layer,
@@ -3699,15 +3858,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3699
3858
  ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
3700
3859
 
3701
3860
  ggml_free(cpy_ctx);
3861
+
3862
+ // our data is now in the kout3d_data and vout3d_data buffers
3863
+ // write them to file
3864
+ data_ctx->write(kout3d_data.data(), kout3d_data.size());
3865
+ data_ctx->write(vout3d_data.data(), vout3d_data.size());
3702
3866
  }
3703
3867
  }
3868
+ }
3704
3869
 
3705
- const size_t written = out - dst;
3706
- const size_t max_size = llama_get_state_size(ctx);
3707
-
3708
- LLAMA_ASSERT(written <= max_size);
3870
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3871
+ llama_data_buffer_context data_ctx(dst);
3872
+ llama_copy_state_data_internal(ctx, &data_ctx);
3709
3873
 
3710
- return written;
3874
+ return data_ctx.get_size_written();
3711
3875
  }
3712
3876
 
3713
3877
  // Sets the state reading from the specified source address
@@ -3766,7 +3930,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3766
3930
  const auto & kv_self = ctx->kv_self;
3767
3931
  const auto & hparams = ctx->model.hparams;
3768
3932
  const int n_layer = hparams.n_layer;
3769
- const int n_embd = hparams.n_embd;
3933
+ const int n_embd = hparams.n_embd_gqa();
3770
3934
  const int n_ctx = hparams.n_ctx;
3771
3935
 
3772
3936
  size_t kv_size;
@@ -3826,7 +3990,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const uint32_t version = file.read_u32();

  if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
- fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  return false;
  }

@@ -3834,7 +3998,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  file.read_raw(&session_hparams, sizeof(llama_hparams));

  if (session_hparams != ctx->model.hparams) {
- fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
  return false;
  }
  }
@@ -3844,7 +4008,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const uint32_t n_token_count = file.read_u32();

  if (n_token_count > n_token_capacity) {
- fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  return false;
  }

@@ -3858,7 +4022,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const size_t n_state_size_max = llama_get_state_size(ctx);

  if (n_state_size_cur > n_state_size_max) {
- fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
  return false;
  }

@@ -3875,7 +4039,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  try {
  return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  } catch (const std::exception & err) {
- fprintf(stderr, "error loading session file: %s\n", err.what());
+ LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
  return false;
  }
  }
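
Illustration (not part of the diff): the reads above, together with the writes in the save hunk below, imply the following on-disk layout for a session file. The magic/version/hparams writes sit outside the changed lines, so this is a reconstruction from usage, not a quotation; widths follow the read_u32/read_raw calls shown:

    // Reconstructed session-file layout (field order only):
    //   uint32_t      magic;          // LLAMA_SESSION_MAGIC
    //   uint32_t      version;        // LLAMA_SESSION_VERSION
    //   llama_hparams hparams;        // raw struct copy; must match the loaded model
    //   uint32_t      n_token_count;  // number of prompt tokens that follow
    //   llama_token   tokens[n_token_count];
    //   uint8_t       state[];        // apparently the rest of the file: the payload
    //                                 // produced by llama_copy_state_data_internal
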
@@ -3892,15 +4056,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
  file.write_u32((uint32_t) n_token_count);
  file.write_raw(tokens, sizeof(llama_token) * n_token_count);

- // save the context state
- {
- const size_t n_state_size_max = llama_get_state_size(ctx);
-
- std::vector<uint8_t> state_data(n_state_size_max);
- const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
- file.write_raw(state_data.data(), n_state_size_cur);
- }
+ // save the context state using stream saving
+ llama_data_file_context data_ctx(&file);
+ llama_copy_state_data_internal(ctx, &data_ctx);

  return true;
  }
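
Illustration (not part of the diff): llama_data_buffer_context and llama_data_file_context are defined earlier in this diff; from their usage here, the shared interface only needs write() and get_size_written(). A self-contained sketch under assumed names (the real file variant wraps the loader's internal llama_file rather than a raw FILE*):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct data_context_sketch {
        virtual void write(const void * src, size_t size) = 0;
        virtual size_t get_size_written() const = 0;
        virtual ~data_context_sketch() = default;
    };

    // Buffer-backed: how llama_copy_state_data can keep its old dst-pointer API.
    struct buffer_context_sketch : data_context_sketch {
        uint8_t * ptr;
        size_t    written = 0;
        explicit buffer_context_sketch(uint8_t * p) : ptr(p) {}
        void write(const void * src, size_t size) override {
            std::memcpy(ptr, src, size);
            ptr     += size;
            written += size;
        }
        size_t get_size_written() const override { return written; }
    };

    // File-backed: how llama_save_session_file can stream without a temporary buffer.
    struct file_context_sketch : data_context_sketch {
        std::FILE * fp;
        size_t      written = 0;
        explicit file_context_sketch(std::FILE * f) : fp(f) {}
        void write(const void * src, size_t size) override {
            std::fwrite(src, 1, size, fp);
            written += size;
        }
        size_t get_size_written() const override { return written; }
    };

The point of the file-backed path is visible in the hunk above: the old code allocated a buffer of llama_get_state_size() bytes just to write it straight back out, a size that is dominated by the KV cache for large contexts.
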
@@ -3912,7 +4070,7 @@ int llama_eval(
  int n_past,
  int n_threads) {
  if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }
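
Illustration (not part of the diff): llama_eval keeps its non-zero-on-failure contract; only the diagnostic now goes through the log callback instead of being hard-wired to stderr. A hypothetical caller-side check:

    #include <vector>
    #include "llama.h"

    // Returns true on success; the "failed to eval" message (if any) has already been
    // delivered to the registered log callback, or to stderr by default.
    static bool eval_prompt(llama_context * ctx, const std::vector<llama_token> & prompt,
                            int n_past, int n_threads) {
        return llama_eval(ctx, prompt.data(), (int) prompt.size(), n_past, n_threads) == 0;
    }
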

@@ -3934,7 +4092,7 @@ int llama_eval_embd(
  int n_past,
  int n_threads) {
  if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }

@@ -3955,7 +4113,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
  const std::vector<llama_token> tmp(n_batch, llama_token_bos());

  if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }

@@ -3971,7 +4129,7 @@ int llama_tokenize_with_model(
  auto res = llama_tokenize(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
- fprintf(stderr, "%s: too many tokens\n", __func__);
+ LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
  return -((int) res.size());
  }
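
Illustration (not part of the diff): the negative return value above is the "buffer too small" signal, so a caller can tokenize in two passes instead of guessing capacity. A sketch against the public llama_tokenize wrapper (assumed to forward to llama_tokenize_with_model, as in this version's header):

    #include <string>
    #include <vector>
    #include "llama.h"

    static std::vector<llama_token> tokenize(llama_context * ctx, const std::string & text, bool add_bos) {
        std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0)); // generous first guess
        int n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), add_bos);
        if (n < 0) {                       // too many tokens: -n is the required capacity
            tokens.resize((size_t) -n);
            n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), add_bos);
        }
        tokens.resize(n > 0 ? (size_t) n : 0);
        return tokens;
    }
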

@@ -4088,15 +4246,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
  void llama_print_timings(struct llama_context * ctx) {
  const llama_timings timings = llama_get_timings(ctx);

- fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("\n");
+ LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
+ LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+ LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }

  void llama_reset_timings(struct llama_context * ctx) {
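
Illustration for the timing hunk above (not part of the diff): with llama_print_timings now routed through the log callback, an application that wants its own formatting can read the same counters from llama_get_timings and reproduce the per-token figures; report_eval_speed is a hypothetical name:

    #include <cstdio>
    #include "llama.h"

    static void report_eval_speed(llama_context * ctx) {
        const llama_timings t = llama_get_timings(ctx);
        if (t.n_eval > 0) {
            std::printf("eval: %8.2f ms per token, %8.2f tokens per second\n",
                        t.t_eval_ms / t.n_eval, 1e3 * t.n_eval / t.t_eval_ms);
        }
    }
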
@@ -4132,3 +4290,44 @@ const char * llama_print_system_info(void) {
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
+
+ void llama_log_set(llama_log_callback log_callback, void * user_data) {
+ g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+ g_state.log_callback_user_data = user_data;
+ }
+
+ #if defined(_MSC_VER) && !defined(vsnprintf)
+ #define vsnprintf _vsnprintf
+ #endif
+
+ static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+ va_list args_copy;
+ va_copy(args_copy, args);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+ } else {
+ char* buffer2 = new char[len+1];
+ vsnprintf(buffer2, len+1, format, args_copy);
+ buffer2[len] = 0;
+ g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+ delete[] buffer2;
+ }
+ va_end(args_copy);
+ }
+
+ static void llama_log_internal(llama_log_level level, const char * format, ...) {
+ va_list args;
+ va_start(args, format);
+ llama_log_internal_v(level, format, args);
+ va_end(args);
+ }
+
+ static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+ }
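
Illustration (not part of the diff): an application-side sketch of the new hook. Passing a null callback restores llama_log_callback_default, as the implementation above shows; log_to_file is a hypothetical callback name.

    #include <cstdio>
    #include "llama.h"

    // Forward every message to a FILE* supplied through user_data.
    static void log_to_file(llama_log_level level, const char * text, void * user_data) {
        (void) level;                                    // could be used to filter by severity
        std::fputs(text, static_cast<std::FILE *>(user_data));
    }

    int main() {
        std::FILE * log_file = std::fopen("llama.log", "w");
        if (!log_file) {
            return 1;
        }
        llama_log_set(log_to_file, log_file);
        // ... load a model, evaluate, call llama_print_timings(): output lands in llama.log ...
        llama_log_set(nullptr, nullptr);                 // back to the stderr default
        std::fclose(log_file);
        return 0;
    }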