llama_cpp 0.3.6 → 0.3.8

This diff reflects the changes between two publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.
@@ -56,7 +56,14 @@
56
56
  #pragma warning(disable: 4244 4267) // possible loss of data
57
57
  #endif
58
58
 
59
- #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
59
+ static void llama_log_internal(llama_log_level level, const char* format, ...);
60
+ static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
61
+ #define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
62
+ #define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
63
+ #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
64
+
65
+
66
+ #if !defined(GGML_USE_CUBLAS)
60
67
  #include "ggml-alloc.h"
61
68
  #define LLAMA_USE_ALLOCATOR
62
69
  #else
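Note: the body of llama_log_internal is not shown in this diff. A minimal sketch of what the macros above presumably expand into, assuming the formatted message is forwarded to a globally stored callback (the storage appears in a later hunk); the helper name and buffer size below are assumptions, not taken from the package:

    // requires <cstdarg> and <cstdio>
    static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
        char buffer[1024];
        vsnprintf(buffer, sizeof(buffer), format, args);   // format into a local buffer
        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
    }

    static void llama_log_internal(llama_log_level level, const char * format, ...) {
        va_list args;
        va_start(args, format);
        llama_log_internal_v(level, format, args);
        va_end(args);
    }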
@@ -108,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
108
115
  // memory sizes (calculated for n_batch == 512)
109
116
  //
110
117
 
111
- static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
118
+ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
112
119
  {
113
- static std::map<e_model, size_t> k_sizes = {
120
+ std::map<e_model, size_t> k_sizes = {
114
121
  { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
115
122
  { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
116
123
  { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
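Note: the scratch size now depends on n_ctx, so the map can no longer be cached in a static local: a static local is initialized exactly once, freezing the n_ctx of the first call. A standalone sketch of that pitfall, using the MODEL_3B formula above (sizes in MB):

    #include <cstdio>

    static size_t scratch0_cached(int n_ctx) {
        static size_t cached = (size_t) n_ctx / 16 + 92;   // evaluated on the first call only
        return cached;
    }

    int main() {
        printf("%zu\n", scratch0_cached(512));    // 124
        printf("%zu\n", scratch0_cached(4096));   // still 124 instead of 348 -- stale cache
        return 0;
    }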
@@ -149,7 +156,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
149
156
  }
150
157
 
151
158
  // amount of VRAM needed per batch size to hold temporary results
152
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
159
+ // the values for 3b are not derived from testing but instead chosen conservatively
153
160
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
154
161
  {
155
162
  static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +164,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
157
164
  { MODEL_7B, 512ull * kB },
158
165
  { MODEL_13B, 640ull * kB },
159
166
  { MODEL_30B, 768ull * kB },
160
- { MODEL_65B, 1536ull * kB },
161
- { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
167
+ { MODEL_65B, 1280ull * kB },
168
+ { MODEL_70B, 1280ull * kB },
162
169
  };
163
170
  return k_sizes;
164
171
  }
165
172
 
166
173
  // amount of VRAM needed per batch size and context to hold temporary results
167
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
174
+ // the values for 3b are not derived from testing but instead chosen conservatively
168
175
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
169
176
  {
170
177
  static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +179,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
172
179
  { MODEL_7B, 128ull },
173
180
  { MODEL_13B, 160ull },
174
181
  { MODEL_30B, 208ull },
175
- { MODEL_65B, 416ull },
176
- { MODEL_70B, 416ull }, // TODO (likely can be reduced)
182
+ { MODEL_65B, 256ull },
183
+ { MODEL_70B, 256ull },
177
184
  };
178
185
  return k_sizes;
179
186
  }
@@ -438,6 +445,14 @@ struct llama_context {
438
445
  }
439
446
  };
440
447
 
448
+ struct llama_state {
449
+ // We save the log callback globally
450
+ llama_log_callback log_callback = llama_log_callback_default;
451
+ void * log_callback_user_data = nullptr;
452
+ };
453
+ // global state
454
+ static llama_state g_state;
455
+
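Note: with the callback stored in g_state, a setter and a default callback complete the picture. Neither definition is shown in this diff; the sketch below assumes a llama_log_set entry point matching the callback signature declared earlier and a default that writes to stderr:

    void llama_log_set(llama_log_callback log_callback, void * user_data) {
        g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
        g_state.log_callback_user_data = user_data;
    }

    static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);   // same destination as the old fprintf(stderr, ...) calls
        fflush(stderr);
    }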
441
456
  template <typename T>
442
457
  static T checked_mul(T a, T b) {
443
458
  T ret = a * b;
@@ -504,7 +519,7 @@ struct llama_file_loader {
504
519
 
505
520
  llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
506
521
  : file(fname, "rb") {
507
- fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
522
+ LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname);
508
523
  read_magic();
509
524
  read_hparams();
510
525
  read_vocab();
@@ -619,7 +634,7 @@ struct llama_file_saver {
619
634
  llama_file_loader * any_file_loader;
620
635
  llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
621
636
  : file(fname, "wb"), any_file_loader(any_file_loader) {
622
- fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
637
+ LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname);
623
638
  write_magic();
624
639
  write_hparams(new_ftype);
625
640
  write_vocab();
@@ -640,7 +655,7 @@ struct llama_file_saver {
640
655
  }
641
656
  void write_vocab() {
642
657
  if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
643
- fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
658
+ LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
644
659
  }
645
660
  uint32_t n_vocab = any_file_loader->hparams.n_vocab;
646
661
  for (uint32_t i = 0; i < n_vocab; i++) {
@@ -747,12 +762,12 @@ struct llama_model_loader {
747
762
 
748
763
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
749
764
  size_t data_size = 0;
750
- size_t prefetch_size = 0;
765
+ size_t prefetch_size = file_loader->file.size;
751
766
  size_t lock_size = 0;
752
767
  for (const llama_load_tensor & lt : tensors_map.tensors) {
753
768
  data_size += lt.size;
754
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
755
- prefetch_size += lt.size;
769
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
770
+ prefetch_size -= lt.size;
756
771
  }
757
772
  }
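Note: prefetch_size now starts at the full mapped file size and subtracts only the tensors that will not stay on the CPU, instead of summing the CPU tensors. The practical difference is that non-tensor bytes (header, vocabulary) are now included in the prefetch. A toy comparison with illustrative byte counts, not taken from any real model:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t file_size   = 7000000000ull;   // whole mapped model file
        const size_t gpu_tensors =  400000000ull;   // tensors offloaded to the GPU
        const size_t cpu_tensors = 6500000000ull;   // tensors kept on the CPU
        const size_t old_prefetch = cpu_tensors;                // tensor bytes only
        const size_t new_prefetch = file_size - gpu_tensors;    // also covers header/vocab bytes
        printf("old: %zu, new: %zu\n", old_prefetch, new_prefetch);
        return 0;
    }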
758
773
 
@@ -831,7 +846,7 @@ struct llama_model_loader {
831
846
  uint8_t byte = lt.data[i];
832
847
  sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
833
848
  }
834
- fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
849
+ LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
835
850
  llama_format_tensor_shape(lt.ne).c_str(), lt.size);
836
851
  }
837
852
 
@@ -864,7 +879,7 @@ static bool kv_cache_init(
864
879
  cache.ctx = ggml_init(params);
865
880
 
866
881
  if (!cache.ctx) {
867
- fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
882
+ LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
868
883
  return false;
869
884
  }
870
885
 
@@ -969,7 +984,7 @@ int64_t llama_time_us() {
969
984
  // model loading
970
985
  //
971
986
 
972
- static const char *llama_file_version_name(llama_file_version version) {
987
+ static const char * llama_file_version_name(llama_file_version version) {
973
988
  switch (version) {
974
989
  case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
975
990
  case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
@@ -981,7 +996,7 @@ static const char *llama_file_version_name(llama_file_version version) {
981
996
  return "unknown";
982
997
  }
983
998
 
984
- static const char *llama_ftype_name(enum llama_ftype ftype) {
999
+ const char * llama_ftype_name(enum llama_ftype ftype) {
985
1000
  switch (ftype) {
986
1001
  case LLAMA_FTYPE_ALL_F32: return "all F32";
987
1002
  case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1006,7 +1021,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
1006
1021
  }
1007
1022
  }
1008
1023
 
1009
- static const char *llama_model_type_name(e_model type) {
1024
+ static const char * llama_model_type_name(e_model type) {
1010
1025
  switch (type) {
1011
1026
  case MODEL_3B: return "3B";
1012
1027
  case MODEL_7B: return "7B";
@@ -1076,7 +1091,7 @@ static void llama_model_load_internal(
1076
1091
  LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
1077
1092
  hparams.n_head_kv = hparams.n_head / n_gqa;
1078
1093
  if (model.type == e_model::MODEL_65B && n_gqa == 8) {
1079
- fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
1094
+ LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
1080
1095
  model.type = e_model::MODEL_70B;
1081
1096
  hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
1082
1097
  }
@@ -1092,22 +1107,22 @@ static void llama_model_load_internal(
1092
1107
  //const uint32_t n_ff = 28672;
1093
1108
 
1094
1109
  {
1095
- fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
1096
- fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1097
- fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1098
- fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
1099
- fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
1100
- fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
1101
- fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1102
- fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
1103
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1104
- fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1105
- fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
1106
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1107
- fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1108
- fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1109
- fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1110
- fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1110
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(file_version));
1111
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1112
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1113
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
1114
+ LLAMA_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult);
1115
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
1116
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1117
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
1118
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1119
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1120
+ LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
1121
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
1122
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1123
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1124
+ LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1125
+ LLAMA_LOG_INFO("%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1111
1126
  }
1112
1127
 
1113
1128
  if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1135,7 +1150,7 @@ static void llama_model_load_internal(
1135
1150
  size_t ctx_size;
1136
1151
  size_t mmapped_size;
1137
1152
  ml->calc_sizes(&ctx_size, &mmapped_size);
1138
- fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
1153
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
1139
1154
 
1140
1155
  // create the ggml context
1141
1156
  {
@@ -1160,13 +1175,13 @@ static void llama_model_load_internal(
1160
1175
  (void) main_gpu;
1161
1176
  (void) mul_mat_q;
1162
1177
  #if defined(GGML_USE_CUBLAS)
1163
- fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1178
+ LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
1164
1179
  ggml_cuda_set_main_device(main_gpu);
1165
1180
  ggml_cuda_set_mul_mat_q(mul_mat_q);
1166
1181
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1167
1182
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1168
1183
  #elif defined(GGML_USE_CLBLAST)
1169
- fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
1184
+ LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
1170
1185
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1171
1186
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
1172
1187
  #else
@@ -1271,14 +1286,14 @@ static void llama_model_load_internal(
1271
1286
  const size_t mem_required_state =
1272
1287
  scale*hparams.kv_size();
1273
1288
 
1274
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1289
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
1275
1290
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
1276
1291
 
1277
1292
  (void) vram_scratch;
1278
1293
  (void) n_batch;
1279
1294
  #ifdef GGML_USE_CUBLAS
1280
1295
  if (low_vram) {
1281
- fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1296
+ LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1282
1297
  ggml_cuda_set_scratch_size(0); // disable scratch
1283
1298
  } else {
1284
1299
  const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
@@ -1286,7 +1301,7 @@ static void llama_model_load_internal(
1286
1301
  vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
1287
1302
  ggml_cuda_set_scratch_size(vram_scratch);
1288
1303
  if (n_gpu_layers > 0) {
1289
- fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
1304
+ LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
1290
1305
  __func__, vram_scratch_base / kB, vram_scratch_per_context,
1291
1306
  (vram_scratch + MB - 1) / MB); // round up
1292
1307
  }
@@ -1296,9 +1311,9 @@ static void llama_model_load_internal(
1296
1311
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1297
1312
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1298
1313
 
1299
- fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
1314
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
1300
1315
  if (n_gpu_layers > (int) hparams.n_layer) {
1301
- fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
1316
+ LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
1302
1317
  }
1303
1318
  size_t vram_kv_cache = 0;
1304
1319
 
@@ -1307,17 +1322,17 @@ static void llama_model_load_internal(
1307
1322
  const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
1308
1323
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
1309
1324
  if (low_vram) {
1310
- fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1325
+ LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1311
1326
  } else {
1312
- fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1327
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1313
1328
  vram_kv_cache += hparams.kv_size() / 2;
1314
1329
  }
1315
1330
  }
1316
1331
  if (n_gpu_layers > (int) hparams.n_layer + 2) {
1317
1332
  if (low_vram) {
1318
- fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1333
+ LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1319
1334
  } else {
1320
- fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1335
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1321
1336
  vram_kv_cache += hparams.kv_size() / 2;
1322
1337
  }
1323
1338
  }
@@ -1326,9 +1341,9 @@ static void llama_model_load_internal(
1326
1341
  const int max_offloadable_layers = hparams.n_layer + 1;
1327
1342
  #endif // GGML_USE_CUBLAS
1328
1343
 
1329
- fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
1344
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
1330
1345
  __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
1331
- fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1346
+ LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
1332
1347
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
1333
1348
  #else
1334
1349
  (void) n_gpu_layers;
@@ -1387,7 +1402,7 @@ static bool llama_model_load(
1387
1402
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1388
1403
  return true;
1389
1404
  } catch (const std::exception & err) {
1390
- fprintf(stderr, "error loading model: %s\n", err.what());
1405
+ LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
1391
1406
  return false;
1392
1407
  }
1393
1408
  }
@@ -1594,11 +1609,11 @@ static struct ggml_cgraph * llama_build_graph(
1594
1609
  ggml_set_name(Q, "Q");
1595
1610
 
1596
1611
  struct ggml_tensor * K =
1597
- ggml_permute(ctx0,
1598
- ggml_reshape_3d(ctx0,
1599
- ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
1600
- n_embd_head, n_head_kv, n_past + N),
1601
- 0, 2, 1, 3);
1612
+ ggml_view_3d(ctx0, kv_self.k,
1613
+ n_embd_head, n_past + N, n_head_kv,
1614
+ ggml_element_size(kv_self.k)*n_embd_gqa,
1615
+ ggml_element_size(kv_self.k)*n_embd_head,
1616
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
1602
1617
  offload_func_kq(K);
1603
1618
  ggml_set_name(K, "K");
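Note: the permute + reshape pair is replaced by a single strided view into the K cache. An informal reading of the three stride arguments, assuming ggml's usual convention that dimension 0 is contiguous and nb1/nb2 are the byte strides of dimensions 1 and 2; the helper below is illustration only, not part of the source:

    #include <cstddef>

    // byte offset of element (i0, i1, i2) in the K view built above
    static size_t k_view_offset(size_t esz, int n_embd_head, int n_embd_gqa,
                                int n_ctx, int il, int i0, int i1, int i2) {
        const size_t layer_offset = esz * (size_t) n_embd_gqa * n_ctx * il;  // offset arg: start of layer il
        return layer_offset
             + (size_t) i1 * esz * n_embd_gqa     // nb1: step to the next cached position
             + (size_t) i2 * esz * n_embd_head    // nb2: step to the next KV head
             + (size_t) i0 * esz;                 // dimension 0 is contiguous within a head
    }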
1604
1619
 
@@ -1627,9 +1642,9 @@ static struct ggml_cgraph * llama_build_graph(
1627
1642
  struct ggml_tensor * V =
1628
1643
  ggml_view_3d(ctx0, kv_self.v,
1629
1644
  n_past + N, n_embd_head, n_head_kv,
1630
- n_ctx*ggml_element_size(kv_self.v),
1631
- n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
1632
- n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
1645
+ ggml_element_size(kv_self.v)*n_ctx,
1646
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
1647
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
1633
1648
  offload_func_v(V);
1634
1649
  ggml_set_name(V, "V");
1635
1650
 
@@ -1751,7 +1766,7 @@ static struct ggml_cgraph * llama_build_graph(
1751
1766
  }
1752
1767
 
1753
1768
  #if 0
1754
- printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1769
+ LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1755
1770
  ggml_used_mem(ctx0)/1024.0/1024.0,
1756
1771
  lctx.get_buf_max_mem(0)/1024.0/1024.0,
1757
1772
  lctx.get_buf_max_mem(1)/1024.0/1024.0,
@@ -1784,6 +1799,13 @@ static bool llama_eval_internal(
1784
1799
 
1785
1800
  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1786
1801
 
1802
+ LLAMA_ASSERT(n_tokens > 0);
1803
+ LLAMA_ASSERT(n_past >= 0);
1804
+ LLAMA_ASSERT(n_threads > 0);
1805
+ // TODO: keep the values of n_batch and n_ctx
1806
+ // LLAMA_ASSERT(n_tokens <= n_batch);
1807
+ // LLAMA_ASSERT(n_past + n_tokens <= n_ctx);
1808
+
1787
1809
  const int64_t t_start_us = ggml_time_us();
1788
1810
 
1789
1811
  #ifdef GGML_USE_MPI
@@ -1812,7 +1834,7 @@ static bool llama_eval_internal(
1812
1834
  ggml_allocr_alloc_graph(lctx.alloc, gf);
1813
1835
  #endif
1814
1836
 
1815
- // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
1837
+ // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
1816
1838
 
1817
1839
  // for big prompts, if BLAS is enabled, it is better to use only one thread
1818
1840
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1830,11 +1852,7 @@ static bool llama_eval_internal(
1830
1852
  #endif
1831
1853
 
1832
1854
  #ifdef GGML_USE_METAL
1833
- if (lctx.ctx_metal && N == 1) {
1834
- // TODO: disabled until #2413 is resolved
1835
- //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
1836
- // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
1837
- //}
1855
+ if (lctx.ctx_metal) {
1838
1856
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1839
1857
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
1840
1858
  ggml_metal_get_tensor (lctx.ctx_metal, res);
@@ -1842,22 +1860,6 @@ static bool llama_eval_internal(
1842
1860
  ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
1843
1861
  }
1844
1862
  } else {
1845
- // IMPORTANT:
1846
- // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
1847
- // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX
1848
- // coprocessor.
1849
- //
1850
- // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
1851
- // But for now, we have focused only on Matrix x Vector Metal multiplication.
1852
- //
1853
- // TODO: avoid these syncs via shared memory (ref #1696)
1854
- //
1855
- if (lctx.ctx_metal) {
1856
- // We need to sync the GPU KV cache with the CPU KV cache
1857
- ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
1858
- ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1859
- }
1860
-
1861
1863
  ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1862
1864
  }
1863
1865
  #else
@@ -1999,7 +2001,7 @@ struct llama_tokenizer {
1999
2001
  left_sym.n += right_sym.n;
2000
2002
  right_sym.n = 0;
2001
2003
 
2002
- //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
2004
+ //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
2003
2005
 
2004
2006
  // remove the right sym from the chain
2005
2007
  left_sym.next = right_sym.next;
@@ -2082,37 +2084,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
2082
2084
  // grammar - internal
2083
2085
  //
2084
2086
 
2087
+ struct llama_partial_utf8 {
2088
+ uint32_t value; // bit value so far (unshifted)
2089
+ int n_remain; // num bytes remaining; -1 indicates invalid sequence
2090
+ };
2091
+
2085
2092
  struct llama_grammar {
2086
2093
  const std::vector<std::vector<llama_grammar_element>> rules;
2087
2094
  std::vector<std::vector<const llama_grammar_element *>> stacks;
2095
+
2096
+ // buffer for partially generated UTF-8 sequence from accepted tokens
2097
+ llama_partial_utf8 partial_utf8;
2088
2098
  };
2089
2099
 
2090
2100
  struct llama_grammar_candidate {
2091
- size_t index;
2092
- const uint32_t * code_points;
2101
+ size_t index;
2102
+ const uint32_t * code_points;
2103
+ llama_partial_utf8 partial_utf8;
2093
2104
  };
2094
2105
 
2095
- // NOTE: assumes valid utf8 (but checks for overrun)
2096
- // adds a terminating 0 for use as pointer
2097
- std::vector<uint32_t> decode_utf8(const char * src) {
2098
- static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
2106
+ // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
2107
+ // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
2108
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
2109
+ const char * src,
2110
+ llama_partial_utf8 partial_start) {
2111
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
2099
2112
  const char * pos = src;
2100
2113
  std::vector<uint32_t> code_points;
2114
+ uint32_t value = partial_start.value;
2115
+ int n_remain = partial_start.n_remain;
2116
+
2117
+ // continue previous decode, if applicable
2118
+ while (*pos != 0 && n_remain > 0) {
2119
+ uint8_t next_byte = static_cast<uint8_t>(*pos);
2120
+ if ((next_byte >> 6) != 2) {
2121
+ // invalid sequence, abort
2122
+ code_points.push_back(0);
2123
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
2124
+ }
2125
+ value = (value << 6) + (next_byte & 0x3F);
2126
+ ++pos;
2127
+ --n_remain;
2128
+ }
2129
+
2130
+ if (partial_start.n_remain > 0 && n_remain == 0) {
2131
+ code_points.push_back(value);
2132
+ }
2133
+
2134
+ // decode any subsequent utf-8 sequences, which may end in an incomplete one
2101
2135
  while (*pos != 0) {
2102
2136
  uint8_t first_byte = static_cast<uint8_t>(*pos);
2103
2137
  uint8_t highbits = first_byte >> 4;
2104
- int len = lookup[highbits];
2105
- uint8_t mask = (1 << (8 - len)) - 1;
2106
- uint32_t value = first_byte & mask;
2107
- const char * end = pos + len; // may overrun!
2138
+ n_remain = lookup[highbits] - 1;
2139
+
2140
+ if (n_remain < 0) {
2141
+ // invalid sequence, abort
2142
+ code_points.clear();
2143
+ code_points.push_back(0);
2144
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
2145
+ }
2146
+
2147
+ uint8_t mask = (1 << (7 - n_remain)) - 1;
2148
+ value = first_byte & mask;
2108
2149
  ++pos;
2109
- for ( ; pos < end && *pos != 0; ++pos) {
2150
+ while (*pos != 0 && n_remain > 0) {
2110
2151
  value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
2152
+ ++pos;
2153
+ --n_remain;
2154
+ }
2155
+ if (n_remain == 0) {
2156
+ code_points.push_back(value);
2111
2157
  }
2112
- code_points.push_back(value);
2113
2158
  }
2114
2159
  code_points.push_back(0);
2115
- return code_points;
2160
+
2161
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
2116
2162
  }
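Note: a worked example of the incremental decode above, assuming the decode_utf8 from this hunk is in scope. The euro sign U+20AC (bytes E2 82 AC) split across two token strings:

    // first token ends mid-sequence: only two of the three bytes
    auto first  = decode_utf8("\xE2\x82", llama_partial_utf8{ 0, 0 });
    // first.first   == { 0 }           terminator only, no complete code point yet
    // first.second  == { 0x82, 1 }     one continuation byte still expected

    // second token supplies the final byte and completes the code point
    auto second = decode_utf8("\xAC", first.second);
    // second.first  == { 0x20AC, 0 }   the euro sign, then the terminator
    // second.second == { 0x20AC, 0 }   nothing left pending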
2117
2163
 
2118
2164
  // returns true iff pos points to the end of one of the definitions of a rule
@@ -2149,6 +2195,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
2149
2195
  return std::make_pair(found == is_positive_char, pos);
2150
2196
  }
2151
2197
 
2198
+ // returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
2199
+ // range at pos (regular or inverse range)
2200
+ // asserts that pos is pointing to a char range element
2201
+ static bool llama_grammar_match_partial_char(
2202
+ const llama_grammar_element * pos,
2203
+ const llama_partial_utf8 partial_utf8) {
2204
+
2205
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
2206
+ LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
2207
+
2208
+ uint32_t partial_value = partial_utf8.value;
2209
+ int n_remain = partial_utf8.n_remain;
2210
+
2211
+ // invalid sequence or 7-bit char split across 2 bytes (overlong)
2212
+ if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
2213
+ return false;
2214
+ }
2215
+
2216
+ // range of possible code points this partial UTF-8 sequence could complete to
2217
+ uint32_t low = partial_value << (n_remain * 6);
2218
+ uint32_t high = low | ((1 << (n_remain * 6)) - 1);
2219
+
2220
+ if (low == 0) {
2221
+ if (n_remain == 2) {
2222
+ low = 1 << 11;
2223
+ } else if (n_remain == 3) {
2224
+ low = 1 << 16;
2225
+ }
2226
+ }
2227
+
2228
+ do {
2229
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
2230
+ // inclusive range, e.g. [a-z]
2231
+ if (pos->value <= high && low <= pos[1].value) {
2232
+ return is_positive_char;
2233
+ }
2234
+ pos += 2;
2235
+ } else {
2236
+ // exact char match, e.g. [a] or "a"
2237
+ if (low <= pos->value && pos->value <= high) {
2238
+ return is_positive_char;
2239
+ }
2240
+ pos += 1;
2241
+ }
2242
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
2243
+
2244
+ return !is_positive_char;
2245
+ }
2246
+
2247
+
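Note: a worked example of the low/high bound above, for a token whose text ends in the single lead byte 0xE2 (decode_utf8 leaves partial_utf8 = { 0x02, 2 }):

    // partial_value = 0x02, n_remain = 2
    // low  = 0x02 << 12 = 0x2000
    // high = low | 0xFFF = 0x2FFF
    //
    // the pending bytes can only complete to a code point in [U+2000, U+2FFF], so:
    //  - an exact-char rule for U+20AC ("€")    can still be satisfied -> token not rejected yet
    //  - a range rule [a-z] (U+0061..U+007A)    can never be satisfied -> token rejected
    // (both for positive rules; the result is inverted for LLAMA_GRETYPE_CHAR_NOT)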
2152
2248
  // transforms a grammar pushdown stack into N possible stacks, all ending
2153
2249
  // at a character range (terminal element)
2154
2250
  static void llama_grammar_advance_stack(
@@ -2249,8 +2345,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
2249
2345
  std::vector<llama_grammar_candidate> rejects;
2250
2346
 
2251
2347
  if (stack.empty()) {
2252
- // accept nothing; EOS is handled elsewhere
2253
- rejects.insert(rejects.end(), candidates.begin(), candidates.end());
2348
+ for (auto tok : candidates) {
2349
+ if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
2350
+ rejects.push_back(tok);
2351
+ }
2352
+ }
2254
2353
  return rejects;
2255
2354
  }
2256
2355
 
@@ -2258,10 +2357,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
2258
2357
 
2259
2358
  std::vector<llama_grammar_candidate> next_candidates;
2260
2359
  for (auto tok : candidates) {
2261
- if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
2262
- if (tok.code_points[1] != 0) {
2263
- next_candidates.push_back({ tok.index, tok.code_points + 1 });
2360
+ if (*tok.code_points == 0) {
2361
+ // reached end of full codepoints in token, reject iff it ended in a partial sequence
2362
+ // that cannot satisfy this position in grammar
2363
+ if (tok.partial_utf8.n_remain != 0 &&
2364
+ !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
2365
+ rejects.push_back(tok);
2264
2366
  }
2367
+ } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
2368
+ next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
2265
2369
  } else {
2266
2370
  rejects.push_back(tok);
2267
2371
  }
@@ -2279,7 +2383,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
2279
2383
 
2280
2384
  auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
2281
2385
  for (auto tok : next_rejects) {
2282
- rejects.push_back({ tok.index, tok.code_points - 1 });
2386
+ rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
2283
2387
  }
2284
2388
 
2285
2389
  return rejects;
@@ -2344,7 +2448,7 @@ struct llama_grammar * llama_grammar_init(
2344
2448
  }
2345
2449
  } while (true);
2346
2450
 
2347
- return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
2451
+ return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
2348
2452
  }
2349
2453
 
2350
2454
  void llama_grammar_free(struct llama_grammar * grammar) {
@@ -2650,8 +2754,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
2650
2754
 
2651
2755
  const llama_token eos = llama_token_eos();
2652
2756
 
2653
- std::vector<std::vector<uint32_t>> candidates_decoded;
2654
- std::vector<llama_grammar_candidate> candidates_grammar;
2757
+ std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
2758
+ std::vector<llama_grammar_candidate> candidates_grammar;
2655
2759
 
2656
2760
  for (size_t i = 0; i < candidates->size; ++i) {
2657
2761
  const llama_token id = candidates->data[i].id;
@@ -2663,8 +2767,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
2663
2767
  } else if (*str == 0) {
2664
2768
  candidates->data[i].logit = -INFINITY;
2665
2769
  } else {
2666
- candidates_decoded.push_back(decode_utf8(str));
2667
- candidates_grammar.push_back({ i, candidates_decoded.back().data() });
2770
+ candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
2771
+ candidates_grammar.push_back({
2772
+ i, candidates_decoded.back().first.data(), candidates_decoded.back().second
2773
+ });
2668
2774
  }
2669
2775
  }
2670
2776
 
@@ -2865,11 +2971,14 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
2865
2971
  }
2866
2972
 
2867
2973
  const char * str = llama_token_to_str(ctx, token);
2974
+
2868
2975
  // Note terminating 0 in decoded string
2869
- auto code_points = decode_utf8(str);
2976
+ const auto decoded = decode_utf8(str, grammar->partial_utf8);
2977
+ const auto & code_points = decoded.first;
2870
2978
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
2871
2979
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
2872
2980
  }
2981
+ grammar->partial_utf8 = decoded.second;
2873
2982
  LLAMA_ASSERT(!grammar->stacks.empty());
2874
2983
 
2875
2984
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -3007,7 +3116,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3007
3116
  tensor.data = read_data.addr;
3008
3117
  model_loader->load_data_for(tensor);
3009
3118
 
3010
- printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
3119
+ LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
3011
3120
  ++idx, model_loader->tensors_map.tensors.size(),
3012
3121
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
3013
3122
  ggml_type_name(tensor.type));
@@ -3029,7 +3138,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3029
3138
  new_type = tensor.type;
3030
3139
  new_data = tensor.data;
3031
3140
  new_size = tensor.size;
3032
- printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
3141
+ LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
3033
3142
  } else {
3034
3143
  new_type = quantized_type;
3035
3144
  #ifdef GGML_USE_K_QUANTS
@@ -3064,17 +3173,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3064
3173
  int nx = tensor.ne.at(0);
3065
3174
  int ny = tensor.ne.at(1);
3066
3175
  if (nx % QK_K != 0 || ny % QK_K != 0) {
3067
- fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
3176
+ LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
3068
3177
  convert_incompatible_tensor = true;
3069
3178
  }
3070
3179
  }
3071
3180
  if (convert_incompatible_tensor) {
3072
3181
  if (tensor.name == "output.weight") {
3073
3182
  new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
3074
- fprintf(stderr, "F16 will be used for this tensor instead.\n");
3183
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
3075
3184
  } else if (tensor.name == "tok_embeddings.weight") {
3076
3185
  new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
3077
- fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
3186
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
3078
3187
  } else {
3079
3188
  throw std::runtime_error("Unsupported tensor size encountered\n");
3080
3189
  }
@@ -3094,7 +3203,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3094
3203
  f32_data = (float *) f32_conv_buf.addr;
3095
3204
  }
3096
3205
 
3097
- printf("quantizing to %s .. ", ggml_type_name(new_type));
3206
+ LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
3098
3207
  fflush(stdout);
3099
3208
 
3100
3209
  work.resize(nelements * 4); // upper bound on size
@@ -3144,7 +3253,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3144
3253
  }
3145
3254
  }
3146
3255
 
3147
- printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
3256
+ LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
3148
3257
  int64_t tot_count = 0;
3149
3258
  for (size_t i = 0; i < hist_cur.size(); i++) {
3150
3259
  hist_all[i] += hist_cur[i];
@@ -3153,18 +3262,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3153
3262
 
3154
3263
  if (tot_count > 0) {
3155
3264
  for (size_t i = 0; i < hist_cur.size(); i++) {
3156
- printf("%5.3f ", hist_cur[i] / float(nelements));
3265
+ LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
3157
3266
  }
3158
3267
  }
3159
- printf("\n");
3268
+ LLAMA_LOG_INFO("\n");
3160
3269
  }
3161
3270
  total_size_org += tensor.size;
3162
3271
  total_size_new += new_size;
3163
3272
  file_saver.write_tensor(tensor, new_type, new_data, new_size);
3164
3273
  }
3165
3274
 
3166
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
3167
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
3275
+ LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
3276
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
3168
3277
 
3169
3278
  {
3170
3279
  int64_t sum_all = 0;
@@ -3173,11 +3282,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
3173
3282
  }
3174
3283
 
3175
3284
  if (sum_all > 0) {
3176
- printf("%s: hist: ", __func__);
3285
+ LLAMA_LOG_INFO("%s: hist: ", __func__);
3177
3286
  for (size_t i = 0; i < hist_all.size(); i++) {
3178
- printf("%5.3f ", hist_all[i] / float(sum_all));
3287
+ LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
3179
3288
  }
3180
- printf("\n");
3289
+ LLAMA_LOG_INFO("\n");
3181
3290
  }
3182
3291
  }
3183
3292
  }
@@ -3201,8 +3310,8 @@ struct llama_model * llama_load_model_from_file(
3201
3310
  params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3202
3311
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
3203
3312
  params.progress_callback_user_data)) {
3313
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
3204
3314
  delete model;
3205
- fprintf(stderr, "%s: failed to load model\n", __func__);
3206
3315
  return nullptr;
3207
3316
  }
3208
3317
 
@@ -3235,10 +3344,9 @@ struct llama_context * llama_new_context_with_model(
3235
3344
  unsigned percentage = (unsigned) (100 * progress);
3236
3345
  while (percentage > *cur_percentage_p) {
3237
3346
  *cur_percentage_p = percentage;
3238
- fprintf(stderr, ".");
3239
- fflush(stderr);
3347
+ LLAMA_LOG_INFO(".");
3240
3348
  if (percentage >= 100) {
3241
- fprintf(stderr, "\n");
3349
+ LLAMA_LOG_INFO("\n");
3242
3350
  }
3243
3351
  }
3244
3352
  };
@@ -3252,14 +3360,14 @@ struct llama_context * llama_new_context_with_model(
3252
3360
  // reserve memory for context buffers
3253
3361
  if (!params.vocab_only) {
3254
3362
  if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
3255
- fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
3363
+ LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
3256
3364
  llama_free(ctx);
3257
3365
  return nullptr;
3258
3366
  }
3259
3367
 
3260
3368
  {
3261
3369
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
3262
- fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
3370
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
3263
3371
  }
3264
3372
 
3265
3373
  const auto & hparams = ctx->model.hparams;
@@ -3289,24 +3397,40 @@ struct llama_context * llama_new_context_with_model(
3289
3397
  int n_past = hparams.n_ctx - n_tokens;
3290
3398
  llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
3291
3399
  ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
3292
-
3400
+ #ifdef GGML_USE_METAL
3401
+ if (params.n_gpu_layers > 0) {
3402
+ ctx->ctx_metal = ggml_metal_init(1);
3403
+ if (!ctx->ctx_metal) {
3404
+ LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
3405
+ llama_free(ctx);
3406
+ return NULL;
3407
+ }
3408
+ ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
3409
+ ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
3410
+ }
3411
+ #endif
3293
3412
  // measure memory requirements for the graph
3294
3413
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
3295
3414
 
3296
- fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
3415
+ LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
3297
3416
 
3298
3417
  // debug - for comparison with scratch buffer
3299
3418
  //size_t prev_req =
3300
3419
  // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
3301
3420
  // MEM_REQ_SCRATCH1().at(ctx->model.type) +
3302
3421
  // MEM_REQ_EVAL().at(ctx->model.type);
3303
- //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
3422
+ //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
3304
3423
 
3305
3424
  // recreate allocator with exact memory requirements
3306
3425
  ggml_allocr_free(ctx->alloc);
3307
3426
 
3308
3427
  ctx->buf_alloc.resize(alloc_size);
3309
3428
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
3429
+ #ifdef GGML_USE_METAL
3430
+ if (ctx->ctx_metal) {
3431
+ ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
3432
+ }
3433
+ #endif
3310
3434
  }
3311
3435
  #else
3312
3436
  ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3321,7 +3445,6 @@ struct llama_context * llama_new_context_with_model(
3321
3445
  #ifdef GGML_USE_METAL
3322
3446
  if (params.n_gpu_layers > 0) {
3323
3447
  // this allocates all Metal resources and memory buffers
3324
- ctx->ctx_metal = ggml_metal_init(1);
3325
3448
 
3326
3449
  void * data_ptr = NULL;
3327
3450
  size_t data_size = 0;
@@ -3336,13 +3459,13 @@ struct llama_context * llama_new_context_with_model(
3336
3459
 
3337
3460
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
3338
3461
 
3339
- fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
3462
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
3340
3463
 
3341
- #define LLAMA_METAL_CHECK_BUF(result) \
3342
- if (!(result)) { \
3343
- fprintf(stderr, "%s: failed to add buffer\n", __func__); \
3344
- llama_free(ctx); \
3345
- return NULL; \
3464
+ #define LLAMA_METAL_CHECK_BUF(result) \
3465
+ if (!(result)) { \
3466
+ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
3467
+ llama_free(ctx); \
3468
+ return NULL; \
3346
3469
  }
3347
3470
 
3348
3471
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
@@ -3350,8 +3473,7 @@ struct llama_context * llama_new_context_with_model(
3350
3473
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
3351
3474
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
3352
3475
 
3353
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
3354
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
3476
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
3355
3477
  #undef LLAMA_METAL_CHECK_BUF
3356
3478
  }
3357
3479
  #endif
@@ -3396,19 +3518,19 @@ int llama_model_quantize(
3396
3518
  llama_model_quantize_internal(fname_inp, fname_out, params);
3397
3519
  return 0;
3398
3520
  } catch (const std::exception & err) {
3399
- fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
3521
+ LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
3400
3522
  return 1;
3401
3523
  }
3402
3524
  }
3403
3525
 
3404
3526
  int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
3405
- fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
3527
+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
3406
3528
 
3407
3529
  const int64_t t_start_lora_us = ggml_time_us();
3408
3530
 
3409
3531
  auto fin = std::ifstream(path_lora, std::ios::binary);
3410
3532
  if (!fin) {
3411
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
3533
+ LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
3412
3534
  return 1;
3413
3535
  }
3414
3536
 
@@ -3417,14 +3539,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3417
3539
  uint32_t magic;
3418
3540
  fin.read((char *) &magic, sizeof(magic));
3419
3541
  if (magic != LLAMA_FILE_MAGIC_GGLA) {
3420
- fprintf(stderr, "%s: bad file magic\n", __func__);
3542
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
3421
3543
  return 1;
3422
3544
  }
3423
3545
  uint32_t format_version;
3424
3546
  fin.read((char *) &format_version, sizeof(format_version));
3425
3547
 
3426
3548
  if (format_version != 1) {
3427
- fprintf(stderr, "%s: unsupported file version\n", __func__ );
3549
+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
3428
3550
  return 1;
3429
3551
  }
3430
3552
  }
@@ -3435,7 +3557,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3435
3557
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
3436
3558
  float scaling = (float)lora_alpha / (float)lora_r;
3437
3559
 
3438
- fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
3560
+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
3439
3561
 
3440
3562
 
3441
3563
  // create a temporary ggml context to store the lora tensors
@@ -3461,7 +3583,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3461
3583
  ggml_context * base_ctx = NULL;
3462
3584
  llama_buffer base_buf;
3463
3585
  if (path_base_model) {
3464
- fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
3586
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
3465
3587
  model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
3466
3588
 
3467
3589
  size_t ctx_size;
@@ -3518,17 +3640,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3518
3640
  const std::string lora_suffix = ".lora";
3519
3641
  size_t pos = name.rfind(lora_suffix);
3520
3642
  if (pos == std::string::npos) {
3521
- fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
3643
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
3522
3644
  return 1;
3523
3645
  }
3524
3646
 
3525
3647
  std::string lora_type = name.substr(pos + lora_suffix.length());
3526
3648
  std::string base_name = name;
3527
3649
  base_name.erase(pos);
3528
- // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
3650
+ // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
3529
3651
 
3530
3652
  if (model_tensors.find(base_name) == model_tensors.end()) {
3531
- fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
3653
+ LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
3532
3654
  return 1;
3533
3655
  }
3534
3656
 
@@ -3539,7 +3661,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3539
3661
  case 1: wtype = GGML_TYPE_F16; break;
3540
3662
  default:
3541
3663
  {
3542
- fprintf(stderr, "%s: invalid tensor data type '%d'\n",
3664
+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
3543
3665
  __func__, ftype);
3544
3666
  return false;
3545
3667
  }
@@ -3549,7 +3671,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3549
3671
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
3550
3672
  }
3551
3673
  else {
3552
- fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
3674
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
3553
3675
  return 1;
3554
3676
  }
3555
3677
  ggml_set_name(lora_tensor, "lora_tensor");
@@ -3587,7 +3709,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3587
3709
  if (model_loader) {
3588
3710
  // load from base model
3589
3711
  if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
3590
- fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
3712
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
3591
3713
  return 1;
3592
3714
  }
3593
3715
  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
@@ -3603,8 +3725,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3603
3725
 
3604
3726
  if (ggml_is_quantized(base_t->type)) {
3605
3727
  if (!warned) {
3606
- fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
3607
- "use a f16 or f32 base model with --lora-base\n", __func__);
3728
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
3729
+ "use a f16 or f32 base model with --lora-base\n", __func__);
3608
3730
  warned = true;
3609
3731
  }
3610
3732
  }
@@ -3618,8 +3740,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3618
3740
  ggml_set_name(loraB, "loraB");
3619
3741
 
3620
3742
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
3621
- fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
3622
- " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
3743
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
3744
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
3623
3745
  return 1;
3624
3746
  }
3625
3747
 
@@ -3664,7 +3786,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3664
3786
 
3665
3787
  n_tensors++;
3666
3788
  if (n_tensors % 4 == 0) {
3667
- fprintf(stderr, ".");
3789
+ LLAMA_LOG_INFO(".");
3668
3790
  }
3669
3791
  }
3670
3792
  }
@@ -3676,7 +3798,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3676
3798
  }
3677
3799
 
3678
3800
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
3679
- fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
3801
+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
3680
3802
 
3681
3803
  return 0;
3682
3804
  }
@@ -3685,7 +3807,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
3685
3807
  try {
3686
3808
  return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
3687
3809
  } catch (const std::exception & err) {
3688
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
3810
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
3689
3811
  return 1;
3690
3812
  }
3691
3813
  }
@@ -3694,7 +3816,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
3694
3816
  try {
3695
3817
  return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
3696
3818
  } catch (const std::exception & err) {
3697
- fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
3819
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
3698
3820
  return 1;
3699
3821
  }
3700
3822
  }
@@ -3743,10 +3865,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
3743
3865
  return s_total;
3744
3866
  }
3745
3867
 
3746
- // Copies the state to the specified destination address
3747
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3748
- uint8_t * out = dst;
3749
-
3868
+ /** copy state data into either a buffer or file depending on the passed in context
3869
+ *
3870
+ * file context:
3871
+ * llama_file file("/path", "wb");
3872
+ * llama_data_file_context data_ctx(&file);
3873
+ * llama_copy_state_data(ctx, &data_ctx);
3874
+ *
3875
+ * buffer context:
3876
+ * std::vector<uint8_t> buf(max_size, 0);
3877
+ * llama_data_buffer_context data_ctx(&buf.data());
3878
+ * llama_copy_state_data(ctx, &data_ctx);
3879
+ *
3880
+ */
3881
+ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
3750
3882
  // copy rng
3751
3883
  {
3752
3884
  std::stringstream rng_ss;
@@ -3758,8 +3890,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3758
3890
  memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
3759
3891
  memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
3760
3892
 
3761
- memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
3762
- memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
3893
+ data_ctx->write(&rng_size, sizeof(rng_size));
3894
+ data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
3763
3895
  }
3764
3896
 
3765
3897
  // copy logits
@@ -3767,25 +3899,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3767
3899
  const size_t logits_cap = ctx->logits.capacity();
3768
3900
  const size_t logits_size = ctx->logits.size();
3769
3901
 
3770
- memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
3771
- memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
3902
+ data_ctx->write(&logits_cap, sizeof(logits_cap));
3903
+ data_ctx->write(&logits_size, sizeof(logits_size));
3772
3904
 
3773
3905
  if (logits_size) {
3774
- memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
3906
+ data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
3775
3907
  }
3776
3908
 
3777
- out += logits_cap * sizeof(float);
3909
+ // If there is a gap between the size and the capacity, write padding
3910
+ size_t padding_size = (logits_cap - logits_size) * sizeof(float);
3911
+ if (padding_size > 0) {
3912
+ std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
3913
+ data_ctx->write(padding.data(), padding_size);
3914
+ }
3778
3915
  }
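Note: the explicit zero padding keeps the serialized logits section the same size as before, when the old code simply advanced the output pointer past the full capacity without writing the tail. The resulting layout, in write order:

    // serialized logits section:
    //   sizeof(size_t)                               logits_cap
    //   sizeof(size_t)                               logits_size
    //   logits_size * sizeof(float)                  the logits actually present
    //   (logits_cap - logits_size) * sizeof(float)   zero padding up to the capacity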
3779
3916
 
3780
3917
  // copy embeddings
3781
3918
  {
3782
3919
  const size_t embedding_size = ctx->embedding.size();
3783
3920
 
3784
- memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
3921
+ data_ctx->write(&embedding_size, sizeof(embedding_size));
3785
3922
 
3786
3923
  if (embedding_size) {
3787
- memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
3788
- out += embedding_size * sizeof(float);
3924
+ data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
3789
3925
  }
3790
3926
  }
3791
3927
 
@@ -3800,8 +3936,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3800
3936
  const size_t kv_size = kv_self.buf.size;
3801
3937
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);
3802
3938
 
3803
- memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
3804
- memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
3939
+ data_ctx->write(&kv_size, sizeof(kv_size));
3940
+ data_ctx->write(&kv_ntok, sizeof(kv_ntok));
3805
3941
 
3806
3942
  if (kv_size) {
3807
3943
  const size_t elt_size = ggml_element_size(kv_self.k);
@@ -3810,12 +3946,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3810
3946
  ggml_cgraph gf{};
3811
3947
 
3812
3948
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3813
- kout3d->data = out;
3814
- out += ggml_nbytes(kout3d);
3949
+ std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
3950
+ kout3d->data = kout3d_data.data();
3815
3951
 
3816
3952
  ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
3817
- vout3d->data = out;
3818
- out += ggml_nbytes(vout3d);
3953
+ std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
3954
+ vout3d->data = vout3d_data.data();
3819
3955
 
3820
3956
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
3821
3957
  n_embd, kv_ntok, n_layer,
@@ -3830,15 +3966,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
+
+ // our data is now in the kout3d_data and vout3d_data buffers
+ // write them to file
+ data_ctx->write(kout3d_data.data(), kout3d_data.size());
+ data_ctx->write(vout3d_data.data(), vout3d_data.size());
  }
  }
+ }

- const size_t written = out - dst;
- const size_t max_size = llama_get_state_size(ctx);
-
- LLAMA_ASSERT(written <= max_size);
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ llama_data_buffer_context data_ctx(dst);
+ llama_copy_state_data_internal(ctx, &data_ctx);

- return written;
+ return data_ctx.get_size_written();
  }

  // Sets the state reading from the specified source address
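
The hunks above replace the old pointer-advancing memcpy scheme with calls into a small writer abstraction, so the same serialization code can fill a caller-supplied byte buffer (llama_copy_state_data) or stream directly into a session file (llama_save_session_file). The K/V tensors are first staged in the temporary kout3d_data / vout3d_data vectors because ggml needs a concrete memory destination before the bytes can be handed to the writer. A minimal sketch of the writer contract these calls assume follows; the real llama_data_context classes are defined earlier in this diff and may differ in detail.

    // Sketch only: the interface implied by data_ctx->write() and get_size_written().
    // Assumes <cstring>, <cstdint> and <cstddef> are available, as they are in llama.cpp.
    struct llama_data_context {
        virtual void write(const void * src, size_t size) = 0;
        virtual size_t get_size_written() = 0;
        virtual ~llama_data_context() = default;
    };

    // Buffer-backed variant: copies into a caller-provided array and counts the bytes.
    struct llama_data_buffer_context : llama_data_context {
        uint8_t * ptr;
        size_t size_written = 0;

        llama_data_buffer_context(uint8_t * p) : ptr(p) {}

        void write(const void * src, size_t size) override {
            memcpy(ptr, src, size);
            ptr          += size;
            size_written += size;
        }

        size_t get_size_written() override { return size_written; }
    };

From the caller's point of view the public API is unchanged: allocate llama_get_state_size(ctx) bytes, let llama_copy_state_data() fill them (the return value is the number of bytes actually written), and restore later with llama_set_state_data().
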
@@ -3957,7 +4098,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const uint32_t version = file.read_u32();

  if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
- fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  return false;
  }

@@ -3965,7 +4106,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  file.read_raw(&session_hparams, sizeof(llama_hparams));

  if (session_hparams != ctx->model.hparams) {
- fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
  return false;
  }
  }
@@ -3975,7 +4116,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const uint32_t n_token_count = file.read_u32();

  if (n_token_count > n_token_capacity) {
- fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  return false;
  }

@@ -3989,7 +4130,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  const size_t n_state_size_max = llama_get_state_size(ctx);

  if (n_state_size_cur > n_state_size_max) {
- fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
  return false;
  }

@@ -4006,7 +4147,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  try {
  return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  } catch (const std::exception & err) {
- fprintf(stderr, "error loading session file: %s\n", err.what());
+ LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
  return false;
  }
  }
@@ -4023,15 +4164,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
  file.write_u32((uint32_t) n_token_count);
  file.write_raw(tokens, sizeof(llama_token) * n_token_count);

- // save the context state
- {
- const size_t n_state_size_max = llama_get_state_size(ctx);
-
- std::vector<uint8_t> state_data(n_state_size_max);
- const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
- file.write_raw(state_data.data(), n_state_size_cur);
- }
+ // save the context state using stream saving
+ llama_data_file_context data_ctx(&file);
+ llama_copy_state_data_internal(ctx, &data_ctx);

  return true;
  }
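
With llama_data_file_context the state is streamed straight into the already-open llama_file, so the old intermediate state_data vector sized to the worst-case llama_get_state_size() is no longer needed. The public session API is unchanged; a hedged usage sketch follows (the file name and token buffer below are illustrative, not part of this diff).

    // Try to resume a cached prompt; fall back to a cold start on failure.
    std::vector<llama_token> session_tokens(1024);
    size_t n_session_tokens = 0;
    if (!llama_load_session_file(ctx, "prompt.session",
                                 session_tokens.data(), session_tokens.size(),
                                 &n_session_tokens)) {
        n_session_tokens = 0; // no usable session: evaluate the prompt from scratch
    }

    // ... evaluate the prompt and generate ...

    // Persist the evaluated tokens plus the KV cache for the next run.
    llama_save_session_file(ctx, "prompt.session",
                            session_tokens.data(), n_session_tokens);
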
@@ -4043,7 +4178,7 @@ int llama_eval(
  int n_past,
  int n_threads) {
  if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }
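
Evaluation failures are now reported through the logging hooks rather than a hard-coded fprintf(stderr, ...), so a callback installed with llama_log_set() (added at the end of this diff) sees them as well. A minimal caller-side sketch, with the token buffer, n_past and thread count as placeholders:

    // llama_eval() returns 0 on success and 1 on failure (reported via LLAMA_LOG_ERROR).
    if (llama_eval(ctx, tokens.data(), (int) tokens.size(), n_past, /*n_threads=*/4) != 0) {
        return 1;
    }
    n_past += (int) tokens.size();
    const float * logits = llama_get_logits(ctx); // logits for the evaluated batch
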
@@ -4065,7 +4200,7 @@ int llama_eval_embd(
  int n_past,
  int n_threads) {
  if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }

@@ -4086,7 +4221,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
  const std::vector<llama_token> tmp(n_batch, llama_token_bos());

  if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
  return 1;
  }

@@ -4102,7 +4237,7 @@ int llama_tokenize_with_model(
  auto res = llama_tokenize(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
- fprintf(stderr, "%s: too many tokens\n", __func__);
+ LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
  return -((int) res.size());
  }
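
llama_tokenize_with_model() keeps its existing contract: when the output buffer is too small it logs the error and returns the negative of the required token count. Since one byte of input never expands to more than one token, a buffer of text length plus one (for the optional BOS token) is a safe upper bound; a hedged sketch with an illustrative prompt:

    const std::string text = "Hello world";
    std::vector<llama_token> toks(text.size() + 1); // upper bound: one token per byte, plus BOS
    const int n = llama_tokenize_with_model(model, text.c_str(), toks.data(), (int) toks.size(), /*add_bos=*/true);
    if (n >= 0) {
        toks.resize(n); // n is the number of tokens actually produced
    }
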
@@ -4146,6 +4281,10 @@ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

+ int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+ return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_ftype_name(model->hparams.ftype));
+ }
+
  int llama_get_vocab_from_model(
  const struct llama_model * model,
  const char * * strings,
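
The new llama_model_type() helper formats a short human-readable model description into a caller-provided buffer with snprintf semantics, i.e. the return value is the length the full string would need. A hedged usage sketch, assuming model came from llama_load_model_from_file(); the exact text depends on llama_model_type_name() and llama_ftype_name(), for example something like "LLaMA 7B mostly Q4_0":

    char desc[64];
    const int n = llama_model_type(model, desc, sizeof(desc));
    if (n >= 0 && (size_t) n < sizeof(desc)) {
        printf("loaded model: %s\n", desc);
    }
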
@@ -4219,15 +4358,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
  void llama_print_timings(struct llama_context * ctx) {
  const llama_timings timings = llama_get_timings(ctx);

- fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("\n");
+ LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
+ LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+ LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }

  void llama_reset_timings(struct llama_context * ctx) {
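
Because the timing report now goes through LLAMA_LOG_INFO, a callback registered with llama_log_set() captures it too, instead of it being pinned to stderr. The typical caller-side pattern is unchanged (sketch):

    llama_reset_timings(ctx);
    // ... prompt evaluation and sampling ...
    llama_print_timings(ctx); // emitted through the registered log callback
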
@@ -4263,3 +4402,44 @@ const char * llama_print_system_info(void) {
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
+
+
+ void llama_log_set(llama_log_callback log_callback, void * user_data) {
+ g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+ g_state.log_callback_user_data = user_data;
+ }
+
+ #if defined(_MSC_VER) && !defined(vsnprintf)
+ #define vsnprintf _vsnprintf
+ #endif
+
+ static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+ va_list args_copy;
+ va_copy(args_copy, args);
+ char buffer[128];
+ int len = vsnprintf(buffer, 128, format, args);
+ if (len < 128) {
+ g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+ } else {
+ char* buffer2 = new char[len+1];
+ vsnprintf(buffer2, len+1, format, args_copy);
+ buffer2[len] = 0;
+ g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+ delete[] buffer2;
+ }
+ va_end(args_copy);
+ }
+
+ static void llama_log_internal(llama_log_level level, const char * format, ...) {
+ va_list args;
+ va_start(args, format);
+ llama_log_internal_v(level, format, args);
+ va_end(args);
+ }
+
+ static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+ }
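
All library diagnostics now funnel through llama_log_internal(), which formats into a 128-byte stack buffer, falls back to a heap allocation for longer messages, and hands the text to the registered callback; by default that is llama_log_callback_default, which writes to stderr exactly as before. An embedding application can redirect the output, for example (the callback name and log-file handling below are illustrative, not part of the library):

    #include <cstdio>
    #include "llama.h"

    // Route llama.cpp log output into the host application's own sink.
    static void my_llama_log(llama_log_level level, const char * text, void * user_data) {
        FILE * logf = (FILE *) user_data;   // log file opened by the caller
        if (level == LLAMA_LOG_LEVEL_ERROR) {
            fputs(text, stderr);            // still surface errors immediately
        }
        fputs(text, logf);
        fflush(logf);
    }

    int main() {
        FILE * logf = fopen("llama.log", "w");
        llama_log_set(my_llama_log, logf);  // passing nullptr restores llama_log_callback_default
        // ... load the model, create a context, evaluate, sample ...
        fclose(logf);
        return 0;
    }
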