llama_cpp 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,6 +56,13 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
 
+ static void llama_log_internal(llama_log_level level, const char* format, ...);
+ static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+ #define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
+ #define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
+ #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+
  #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
  #include "ggml-alloc.h"
  #define LLAMA_USE_ALLOCATOR
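
Note: the new LLAMA_LOG_* macros simply forward their printf-style arguments to llama_log_internal together with a severity level. As a rough illustration (hypothetical call site, not part of the diff), a log statement expands like this:

    // Hypothetical call site, for illustration only:
    LLAMA_LOG_WARN("%s: unexpected n_vocab = %u\n", __func__, n_vocab);

    // After preprocessing this becomes:
    llama_log_internal(LLAMA_LOG_LEVEL_WARN, "%s: unexpected n_vocab = %u\n", __func__, n_vocab);
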
@@ -149,7 +156,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  }
 
  // amount of VRAM needed per batch size to hold temporary results
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ // the values for 3b are not derived from testing but instead chosen conservatively
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
  {
  static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +164,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
  { MODEL_7B, 512ull * kB },
  { MODEL_13B, 640ull * kB },
  { MODEL_30B, 768ull * kB },
- { MODEL_65B, 1536ull * kB },
- { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+ { MODEL_65B, 1280ull * kB },
+ { MODEL_70B, 1280ull * kB },
  };
  return k_sizes;
  }
 
  // amount of VRAM needed per batch size and context to hold temporary results
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ // the values for 3b are not derived from testing but instead chosen conservatively
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
  {
  static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +179,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
  { MODEL_7B, 128ull },
  { MODEL_13B, 160ull },
  { MODEL_30B, 208ull },
- { MODEL_65B, 416ull },
- { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+ { MODEL_65B, 256ull },
+ { MODEL_70B, 256ull },
  };
  return k_sizes;
  }
@@ -438,6 +445,14 @@ struct llama_context {
  }
  };
 
+ struct llama_state {
+ // We save the log callback globally
+ llama_log_callback log_callback = llama_log_callback_default;
+ void * log_callback_user_data = nullptr;
+ };
+ // global state
+ static llama_state g_state;
+
  template <typename T>
  static T checked_mul(T a, T b) {
  T ret = a * b;
@@ -504,7 +519,7 @@ struct llama_file_loader {
 
  llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
  : file(fname, "rb") {
- fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+ LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname);
  read_magic();
  read_hparams();
  read_vocab();
@@ -619,7 +634,7 @@ struct llama_file_saver {
  llama_file_loader * any_file_loader;
  llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
  : file(fname, "wb"), any_file_loader(any_file_loader) {
- fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
+ LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname);
  write_magic();
  write_hparams(new_ftype);
  write_vocab();
@@ -640,7 +655,7 @@ struct llama_file_saver {
  }
  void write_vocab() {
  if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
- fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
+ LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
  }
  uint32_t n_vocab = any_file_loader->hparams.n_vocab;
  for (uint32_t i = 0; i < n_vocab; i++) {
@@ -747,12 +762,12 @@ struct llama_model_loader {
 
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
  size_t data_size = 0;
- size_t prefetch_size = 0;
+ size_t prefetch_size = file_loader->file.size;
  size_t lock_size = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  data_size += lt.size;
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
- prefetch_size += lt.size;
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+ prefetch_size -= lt.size;
  }
  }
 
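
Note: the prefetch change inverts the accounting. Previously prefetch_size started at zero and accumulated only CPU-resident tensors; now it starts at the full mapped file size and subtracts tensors that live on a GPU backend, so file bytes that are not tensor data (header, hyperparameters, vocabulary) are prefetched as well. A rough sketch of the new accounting, assuming a hypothetical list of (size, on_cpu) entries:

    // Sketch only, not code from the diff.
    size_t prefetch_size = file_size;        // start from the whole mapped file
    for (const auto & t : tensors) {         // hypothetical vector of {size, on_cpu}
        if (!t.on_cpu) {
            prefetch_size -= t.size;         // exclude bytes that will live on the GPU
        }
    }
    // The old form summed only the CPU tensors and therefore excluded the
    // non-tensor bytes of the file from the prefetch.
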
@@ -831,7 +846,7 @@ struct llama_model_loader {
  uint8_t byte = lt.data[i];
  sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
  }
- fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
+ LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
  llama_format_tensor_shape(lt.ne).c_str(), lt.size);
  }
 
@@ -864,7 +879,7 @@ static bool kv_cache_init(
  cache.ctx = ggml_init(params);
 
  if (!cache.ctx) {
- fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
  return false;
  }
 
@@ -1076,7 +1091,7 @@ static void llama_model_load_internal(
  LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
  hparams.n_head_kv = hparams.n_head / n_gqa;
  if (model.type == e_model::MODEL_65B && n_gqa == 8) {
- fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+ LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
  model.type = e_model::MODEL_70B;
  hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
  }
@@ -1092,22 +1107,22 @@ static void llama_model_load_internal(
  //const uint32_t n_ff = 28672;
 
  {
- fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
- fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
- fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
- fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
- fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
- fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
- fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
- fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
- fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
- fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
- fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
- fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
- fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version));
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+ LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult);
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+ LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
+ LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+ LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }
 
  if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1135,7 +1150,7 @@ static void llama_model_load_internal(
  size_t ctx_size;
  size_t mmapped_size;
  ml->calc_sizes(&ctx_size, &mmapped_size);
- fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
  // create the ggml context
  {
@@ -1160,13 +1175,13 @@ static void llama_model_load_internal(
  (void) main_gpu;
  (void) mul_mat_q;
  #if defined(GGML_USE_CUBLAS)
- fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+ LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
  ggml_cuda_set_main_device(main_gpu);
  ggml_cuda_set_mul_mat_q(mul_mat_q);
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
  #elif defined(GGML_USE_CLBLAST)
- fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+ LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
  #else
@@ -1271,14 +1286,14 @@ static void llama_model_load_internal(
  const size_t mem_required_state =
  scale*hparams.kv_size();
 
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
  (void) vram_scratch;
  (void) n_batch;
  #ifdef GGML_USE_CUBLAS
  if (low_vram) {
- fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+ LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
  ggml_cuda_set_scratch_size(0); // disable scratch
  } else {
  const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
@@ -1286,7 +1301,7 @@ static void llama_model_load_internal(
  vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+ LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
  __func__, vram_scratch_base / kB, vram_scratch_per_context,
  (vram_scratch + MB - 1) / MB); // round up
  }
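
Note: with the reduced 65B/70B constants above, the scratch allocation follows the formula in this hunk, vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context). A quick worked comparison (the n_batch and n_ctx values are illustrative, not from the diff):

    // Illustrative numbers only: n_batch = 512, n_ctx = 2048, MODEL_70B.
    const size_t kB = 1024;
    const size_t base_new = 1280 * kB, per_ctx_new = 256; // values in this version
    const size_t base_old = 1536 * kB, per_ctx_old = 416; // previous values
    const size_t n_batch = 512, n_ctx = 2048;

    const size_t scratch_new = n_batch * (base_new + n_ctx * per_ctx_new); // 896 MB
    const size_t scratch_old = n_batch * (base_old + n_ctx * per_ctx_old); // 1184 MB
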
@@ -1296,9 +1311,9 @@ static void llama_model_load_internal(
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
- fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  if (n_gpu_layers > (int) hparams.n_layer) {
- fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
+ LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
  }
  size_t vram_kv_cache = 0;
 
@@ -1307,17 +1322,17 @@ static void llama_model_load_internal(
  const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
  if (low_vram) {
- fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+ LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
  } else {
- fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
  vram_kv_cache += hparams.kv_size() / 2;
  }
  }
  if (n_gpu_layers > (int) hparams.n_layer + 2) {
  if (low_vram) {
- fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+ LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
  } else {
- fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
  vram_kv_cache += hparams.kv_size() / 2;
  }
  }
@@ -1326,9 +1341,9 @@ static void llama_model_load_internal(
  const int max_offloadable_layers = hparams.n_layer + 1;
  #endif // GGML_USE_CUBLAS
 
- fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
  __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+ LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
@@ -1387,7 +1402,7 @@ static bool llama_model_load(
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
- fprintf(stderr, "error loading model: %s\n", err.what());
+ LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
  return false;
  }
  }
@@ -1751,7 +1766,7 @@ static struct ggml_cgraph * llama_build_graph(
  }
 
  #if 0
- printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+ LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
  ggml_used_mem(ctx0)/1024.0/1024.0,
  lctx.get_buf_max_mem(0)/1024.0/1024.0,
  lctx.get_buf_max_mem(1)/1024.0/1024.0,
@@ -1812,7 +1827,7 @@ static bool llama_eval_internal(
  ggml_allocr_alloc_graph(lctx.alloc, gf);
  #endif
 
- // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+ // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1999,7 +2014,7 @@ struct llama_tokenizer {
  left_sym.n += right_sym.n;
  right_sym.n = 0;
 
- //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+ //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
 
  // remove the right sym from the chain
  left_sym.next = right_sym.next;
@@ -3007,7 +3022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  tensor.data = read_data.addr;
  model_loader->load_data_for(tensor);
 
- printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
+ LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
  ++idx, model_loader->tensors_map.tensors.size(),
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
  ggml_type_name(tensor.type));
@@ -3029,7 +3044,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_type = tensor.type;
  new_data = tensor.data;
  new_size = tensor.size;
- printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
@@ -3064,17 +3079,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  int nx = tensor.ne.at(0);
  int ny = tensor.ne.at(1);
  if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
  convert_incompatible_tensor = true;
  }
  }
  if (convert_incompatible_tensor) {
  if (tensor.name == "output.weight") {
  new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
- fprintf(stderr, "F16 will be used for this tensor instead.\n");
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
  } else if (tensor.name == "tok_embeddings.weight") {
  new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
- fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
  } else {
  throw std::runtime_error("Unsupported tensor size encountered\n");
  }
@@ -3094,7 +3109,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  f32_data = (float *) f32_conv_buf.addr;
  }
 
- printf("quantizing to %s .. ", ggml_type_name(new_type));
+ LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
  fflush(stdout);
 
  work.resize(nelements * 4); // upper bound on size
@@ -3144,7 +3159,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  }
 
- printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
  int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
@@ -3153,18 +3168,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
  if (tot_count > 0) {
  for (size_t i = 0; i < hist_cur.size(); i++) {
- printf("%5.3f ", hist_cur[i] / float(nelements));
+ LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
  }
  }
- printf("\n");
+ LLAMA_LOG_INFO("\n");
  }
  total_size_org += tensor.size;
  total_size_new += new_size;
  file_saver.write_tensor(tensor, new_type, new_data, new_size);
  }
 
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
  {
  int64_t sum_all = 0;
@@ -3173,11 +3188,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
 
  if (sum_all > 0) {
- printf("%s: hist: ", __func__);
+ LLAMA_LOG_INFO("%s: hist: ", __func__);
  for (size_t i = 0; i < hist_all.size(); i++) {
- printf("%5.3f ", hist_all[i] / float(sum_all));
+ LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
  }
- printf("\n");
+ LLAMA_LOG_INFO("\n");
  }
  }
  }
@@ -3201,8 +3216,8 @@ struct llama_model * llama_load_model_from_file(
  params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
  params.progress_callback_user_data)) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
  delete model;
- fprintf(stderr, "%s: failed to load model\n", __func__);
  return nullptr;
  }
 
@@ -3235,10 +3250,9 @@ struct llama_context * llama_new_context_with_model(
  unsigned percentage = (unsigned) (100 * progress);
  while (percentage > *cur_percentage_p) {
  *cur_percentage_p = percentage;
- fprintf(stderr, ".");
- fflush(stderr);
+ LLAMA_LOG_INFO(".");
  if (percentage >= 100) {
- fprintf(stderr, "\n");
+ LLAMA_LOG_INFO("\n");
  }
  }
  };
@@ -3252,14 +3266,14 @@ struct llama_context * llama_new_context_with_model(
  // reserve memory for context buffers
  if (!params.vocab_only) {
  if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
- fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+ LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }
 
  {
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
- fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }
 
  const auto & hparams = ctx->model.hparams;
@@ -3293,14 +3307,14 @@ struct llama_context * llama_new_context_with_model(
  // measure memory requirements for the graph
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
- fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
  // debug - for comparison with scratch buffer
  //size_t prev_req =
  // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
  // MEM_REQ_SCRATCH1().at(ctx->model.type) +
  // MEM_REQ_EVAL().at(ctx->model.type);
- //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+ //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 
  // recreate allocator with exact memory requirements
  ggml_allocr_free(ctx->alloc);
@@ -3336,13 +3350,13 @@ struct llama_context * llama_new_context_with_model(
 
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
- fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
- #define LLAMA_METAL_CHECK_BUF(result) \
- if (!(result)) { \
- fprintf(stderr, "%s: failed to add buffer\n", __func__); \
- llama_free(ctx); \
- return NULL; \
+ #define LLAMA_METAL_CHECK_BUF(result) \
+ if (!(result)) { \
+ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+ llama_free(ctx); \
+ return NULL; \
  }
 
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
@@ -3396,19 +3410,19 @@ int llama_model_quantize(
  llama_model_quantize_internal(fname_inp, fname_out, params);
  return 0;
  } catch (const std::exception & err) {
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
+ LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
  return 1;
  }
  }
 
  int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
- fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
  const int64_t t_start_lora_us = ggml_time_us();
 
  auto fin = std::ifstream(path_lora, std::ios::binary);
  if (!fin) {
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+ LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
  return 1;
  }
 
@@ -3417,14 +3431,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  uint32_t magic;
  fin.read((char *) &magic, sizeof(magic));
  if (magic != LLAMA_FILE_MAGIC_GGLA) {
- fprintf(stderr, "%s: bad file magic\n", __func__);
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
  return 1;
  }
  uint32_t format_version;
  fin.read((char *) &format_version, sizeof(format_version));
 
  if (format_version != 1) {
- fprintf(stderr, "%s: unsupported file version\n", __func__ );
+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
  return 1;
  }
  }
@@ -3435,7 +3449,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
  float scaling = (float)lora_alpha / (float)lora_r;
 
- fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
 
  // create a temporary ggml context to store the lora tensors
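
Note: the scaling logged here is lora_alpha / lora_r. As a quick worked example (illustrative values, not from the diff), an adapter with r = 16 and alpha = 32 is applied with a scaling of 2.00:

    // Illustrative values only:
    const int   lora_r     = 16;
    const int   lora_alpha = 32;
    const float scaling    = (float)lora_alpha / (float)lora_r; // 2.00
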
@@ -3461,7 +3475,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  ggml_context * base_ctx = NULL;
  llama_buffer base_buf;
  if (path_base_model) {
- fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
  size_t ctx_size;
@@ -3518,17 +3532,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  const std::string lora_suffix = ".lora";
  size_t pos = name.rfind(lora_suffix);
  if (pos == std::string::npos) {
- fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
  return 1;
  }
 
  std::string lora_type = name.substr(pos + lora_suffix.length());
  std::string base_name = name;
  base_name.erase(pos);
- // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+ // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
 
  if (model_tensors.find(base_name) == model_tensors.end()) {
- fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+ LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
  return 1;
  }
 
@@ -3539,7 +3553,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  case 1: wtype = GGML_TYPE_F16; break;
  default:
  {
- fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
  __func__, ftype);
  return false;
  }
@@ -3549,7 +3563,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
  }
  else {
- fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
  return 1;
  }
  ggml_set_name(lora_tensor, "lora_tensor");
@@ -3587,7 +3601,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  if (model_loader) {
  // load from base model
  if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
- fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
  return 1;
  }
  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
@@ -3603,8 +3617,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
  if (ggml_is_quantized(base_t->type)) {
  if (!warned) {
- fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
- "use a f16 or f32 base model with --lora-base\n", __func__);
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+ "use a f16 or f32 base model with --lora-base\n", __func__);
  warned = true;
  }
  }
@@ -3618,8 +3632,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  ggml_set_name(loraB, "loraB");
 
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
- fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
- " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
  return 1;
  }
 
@@ -3664,7 +3678,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
  n_tensors++;
  if (n_tensors % 4 == 0) {
- fprintf(stderr, ".");
+ LLAMA_LOG_INFO(".");
  }
  }
  }
@@ -3676,7 +3690,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
  }
 
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
- fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
 
  return 0;
  }
@@ -3685,7 +3699,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  try {
  return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
  } catch (const std::exception & err) {
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
  }
  }
@@ -3694,7 +3708,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  try {
  return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
  } catch (const std::exception & err) {
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
  }
  }
@@ -3743,10 +3757,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  return s_total;
  }
 
- // Copies the state to the specified destination address
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
- uint8_t * out = dst;
-
+ /** copy state data into either a buffer or file depending on the passed in context
+ *
+ * file context:
+ * llama_file file("/path", "wb");
+ * llama_data_file_context data_ctx(&file);
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_data_buffer_context data_ctx(&buf.data());
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+ */
+ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
  // copy rng
  {
  std::stringstream rng_ss;
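
Note: llama_data_context is the small abstraction that makes this refactor work: the state writer only ever calls data_ctx->write(...), and the caller decides whether the bytes land in a memory buffer or go straight to a file. The diff does not show the interface itself; a minimal sketch of what it plausibly looks like, inferred from the write()/get_size_written() calls in these hunks (not the exact upstream definition), is:

    // Minimal sketch, inferred from the calls in this diff.
    struct llama_data_context {
        virtual void   write(const void * src, size_t size) = 0;
        virtual size_t get_size_written() = 0;
        virtual ~llama_data_context() = default;
    };
    // A buffer-backed implementation would memcpy into a caller-provided pointer
    // and track the byte count; a file-backed one would forward to the file writer.
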
@@ -3758,8 +3782,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
  memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
 
- memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
- memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+ data_ctx->write(&rng_size, sizeof(rng_size));
+ data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
  }
 
  // copy logits
@@ -3767,25 +3791,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  const size_t logits_cap = ctx->logits.capacity();
  const size_t logits_size = ctx->logits.size();
 
- memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
- memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+ data_ctx->write(&logits_cap, sizeof(logits_cap));
+ data_ctx->write(&logits_size, sizeof(logits_size));
 
  if (logits_size) {
- memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+ data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
  }
 
- out += logits_cap * sizeof(float);
+ // If there is a gap between the size and the capacity, write padding
+ size_t padding_size = (logits_cap - logits_size) * sizeof(float);
+ if (padding_size > 0) {
+ std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
+ data_ctx->write(padding.data(), padding_size);
+ }
  }
 
  // copy embeddings
  {
  const size_t embedding_size = ctx->embedding.size();
 
- memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+ data_ctx->write(&embedding_size, sizeof(embedding_size));
 
  if (embedding_size) {
- memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
- out += embedding_size * sizeof(float);
+ data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
  }
  }
 
@@ -3800,8 +3828,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  const size_t kv_size = kv_self.buf.size;
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);
 
- memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
- memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+ data_ctx->write(&kv_size, sizeof(kv_size));
+ data_ctx->write(&kv_ntok, sizeof(kv_ntok));
 
  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
@@ -3810,12 +3838,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  ggml_cgraph gf{};
 
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
- kout3d->data = out;
- out += ggml_nbytes(kout3d);
+ std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
+ kout3d->data = kout3d_data.data();
 
  ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
- vout3d->data = out;
- out += ggml_nbytes(vout3d);
+ std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
+ vout3d->data = vout3d_data.data();
 
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
  n_embd, kv_ntok, n_layer,
@@ -3830,15 +3858,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
  ggml_free(cpy_ctx);
+
+ // our data is now in the kout3d_data and vout3d_data buffers
+ // write them to file
+ data_ctx->write(kout3d_data.data(), kout3d_data.size());
+ data_ctx->write(vout3d_data.data(), vout3d_data.size());
  }
  }
+ }
 
- const size_t written = out - dst;
- const size_t max_size = llama_get_state_size(ctx);
-
- LLAMA_ASSERT(written <= max_size);
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ llama_data_buffer_context data_ctx(dst);
+ llama_copy_state_data_internal(ctx, &data_ctx);
 
- return written;
+ return data_ctx.get_size_written();
  }
 
  // Sets the state reading from the specified source address
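
Note: the public entry point keeps its old signature, so existing callers are unaffected. A typical caller-side pattern (a sketch using the public API, not code from this diff) still looks like:

    // Sketch: snapshot the context into a caller-owned buffer and restore it later.
    std::vector<uint8_t> state(llama_get_state_size(ctx));          // upper bound
    const size_t written = llama_copy_state_data(ctx, state.data());
    state.resize(written);                                          // actual size

    // ... later, on the same or an equivalently configured context:
    llama_set_state_data(ctx, state.data());
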
@@ -3957,7 +3990,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const uint32_t version = file.read_u32();
 
  if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
- fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  return false;
  }
 
@@ -3965,7 +3998,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  file.read_raw(&session_hparams, sizeof(llama_hparams));
 
  if (session_hparams != ctx->model.hparams) {
- fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
  return false;
  }
  }
@@ -3975,7 +4008,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const uint32_t n_token_count = file.read_u32();
 
  if (n_token_count > n_token_capacity) {
- fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  return false;
  }
 
@@ -3989,7 +4022,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const size_t n_state_size_max = llama_get_state_size(ctx);
 
  if (n_state_size_cur > n_state_size_max) {
- fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
  return false;
  }
 
@@ -4006,7 +4039,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  try {
  return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  } catch (const std::exception & err) {
- fprintf(stderr, "error loading session file: %s\n", err.what());
+ LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
  return false;
  }
  }
@@ -4023,15 +4056,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
  file.write_u32((uint32_t) n_token_count);
  file.write_raw(tokens, sizeof(llama_token) * n_token_count);
 
- // save the context state
- {
- const size_t n_state_size_max = llama_get_state_size(ctx);
-
- std::vector<uint8_t> state_data(n_state_size_max);
- const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
- file.write_raw(state_data.data(), n_state_size_cur);
- }
+ // save the context state using stream saving
+ llama_data_file_context data_ctx(&file);
+ llama_copy_state_data_internal(ctx, &data_ctx);
 
  return true;
  }
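
Note: saving a session no longer stages the whole state in a temporary heap buffer; the file-backed llama_data_context streams it straight to disk. From the caller's side nothing changes. A typical round trip through the public API (sketch only; the file name and token handling are illustrative) is:

    // Sketch: persist and restore a session through the public API.
    std::vector<llama_token> tokens; // tokens evaluated so far
    llama_save_session_file(ctx, "session.bin", tokens.data(), tokens.size());

    // ... later:
    std::vector<llama_token> restored(tokens.size());
    size_t n_restored = 0;
    llama_load_session_file(ctx, "session.bin", restored.data(), restored.size(), &n_restored);
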
@@ -4043,7 +4070,7 @@ int llama_eval(
  int n_past,
  int n_threads) {
  if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }
 
@@ -4065,7 +4092,7 @@ int llama_eval_embd(
  int n_past,
  int n_threads) {
  if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }
 
@@ -4086,7 +4113,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
  const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
  if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }
 
@@ -4102,7 +4129,7 @@ int llama_tokenize_with_model(
  auto res = llama_tokenize(model->vocab, text, add_bos);
 
  if (n_max_tokens < (int) res.size()) {
- fprintf(stderr, "%s: too many tokens\n", __func__);
+ LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
  return -((int) res.size());
  }
 
@@ -4219,15 +4246,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
  void llama_print_timings(struct llama_context * ctx) {
  const llama_timings timings = llama_get_timings(ctx);
 
- fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("\n");
+ LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
+ LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+ LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }
 
  void llama_reset_timings(struct llama_context * ctx) {
@@ -4263,3 +4290,44 @@ const char * llama_print_system_info(void) {
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
+
+ void llama_log_set(llama_log_callback log_callback, void * user_data) {
+ g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+ g_state.log_callback_user_data = user_data;
+ }
+
+ #if defined(_MSC_VER) && !defined(vsnprintf)
+ #define vsnprintf _vsnprintf
+ #endif
+
+ static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+ va_list args_copy;
+ va_copy(args_copy, args);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+ } else {
+ char* buffer2 = new char[len+1];
+ vsnprintf(buffer2, len+1, format, args_copy);
+ buffer2[len] = 0;
+ g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+ delete[] buffer2;
+ }
+ va_end(args_copy);
+ }
+
+ static void llama_log_internal(llama_log_level level, const char * format, ...) {
+ va_list args;
+ va_start(args, format);
+ llama_log_internal_v(level, format, args);
+ va_end(args);
+ }
+
+ static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+ }
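
Note: the practical upshot of this release is that library users can now redirect llama.cpp's output instead of having it written unconditionally to stderr. A minimal sketch of registering a callback through the new llama_log_set entry point (the callback body is illustrative):

    #include <cstdio>
    #include "llama.h"

    // Illustrative callback: tag each message with its level and send it to stdout.
    static void my_log_callback(llama_log_level level, const char * text, void * user_data) {
        (void) user_data;
        const char * tag = level == LLAMA_LOG_LEVEL_ERROR ? "E"
                         : level == LLAMA_LOG_LEVEL_WARN  ? "W" : "I";
        fprintf(stdout, "[%s] %s", tag, text);
    }

    int main() {
        llama_log_set(my_log_callback, /*user_data=*/nullptr);
        // ... subsequent llama_* calls route their log output through my_log_callback.
        llama_log_set(nullptr, nullptr); // passing nullptr restores the default stderr logger
        return 0;
    }
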