llama_cpp 0.2.1 → 0.3.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -19,6 +19,15 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_K_QUANTS
+ #ifndef QK_K
+ #ifdef GGML_QKK_64
+ #define QK_K 64
+ #else
+ #define QK_K 256
+ #endif
+ #endif
+ #endif
 
  #include <array>
  #include <ctime>
@@ -40,6 +49,10 @@
  #include <sstream>
  #include <numeric>
 
+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -173,6 +186,19 @@ struct llama_kv_cache {
  }
  };
 
+ struct llama_vocab {
+ using id = int32_t;
+ using token = std::string;
+
+ struct token_score {
+ token tok;
+ float score;
+ };
+
+ std::unordered_map<token, id> token_to_id;
+ std::vector<token_score> id_to_token;
+ };
+
  struct llama_model {
  e_model type = MODEL_UNKNOWN;
 
@@ -189,10 +215,6 @@ struct llama_model {
  // context
  struct ggml_context * ctx = NULL;
 
- // key + value cache for the self attention
- // TODO: move to llama_state
- struct llama_kv_cache kv_self;
-
  // the model memory buffer
  llama_ctx_buffer buf;
 
@@ -206,6 +228,11 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+ int64_t t_load_us = 0;
+ int64_t t_start_us = 0;
+
+ llama_vocab vocab;
+
  ~llama_model() {
  if (ctx) {
  ggml_free(ctx);
@@ -224,24 +251,11 @@ struct llama_model {
  }
  };
 
- struct llama_vocab {
- using id = int32_t;
- using token = std::string;
-
- struct token_score {
- token tok;
- float score;
- };
-
- std::unordered_map<token, id> token_to_id;
- std::vector<token_score> id_to_token;
- };
-
  struct llama_context {
+ llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
  std::mt19937 rng;
 
- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
  bool has_evaluated_once = false;
 
  int64_t t_sample_us = 0;
@@ -252,8 +266,16 @@ struct llama_context {
  int32_t n_eval = 0; // number of eval calls
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
- llama_model model;
- llama_vocab vocab;
+ const llama_model & model;
+ const llama_vocab & vocab;
+
+ bool model_owner = false;
+
+ int64_t t_load_us;
+ int64_t t_start_us;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;
 
  size_t mem_per_token = 0;
 
@@ -752,7 +774,7 @@ struct llama_model_loader {
  }
 
  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -882,6 +904,7 @@ static bool kv_cache_init(
  const int64_t n_elements = n_embd*n_mem;
 
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ cache.n = 0;
 
  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -900,6 +923,7 @@ static bool kv_cache_init(
  ggml_set_name(cache.k, "cache_k");
  ggml_set_name(cache.v, "cache_v");
 
+ (void) n_gpu_layers;
  #ifdef GGML_USE_CUBLAS
  if (n_gpu_layers > n_layer + 1) {
  ggml_cuda_assign_buffers_no_scratch(cache.v);
@@ -914,21 +938,21 @@
 
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
+ /*.seed =*/ -1,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ {0},
+ /*.progress_callback =*/ nullptr,
+ /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
- /*.seed =*/ -1,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
  /*.embedding =*/ false,
- /*.progress_callback =*/ nullptr,
- /*.progress_callback_user_data =*/ nullptr,
  };
 
  return result;
@@ -953,7 +977,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }
 
- void llama_init_backend() {
+ void llama_init_backend(bool numa) {
  ggml_time_init();
 
  // needed to initialize f16 tables
@@ -962,6 +986,10 @@ void llama_init_backend() {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
+
+ if (numa) {
+ ggml_numa_init();
+ }
  }
 
  int64_t llama_time_us() {
@@ -1022,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {
 
  static void llama_model_load_internal(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1036,12 +1065,11 @@ static void llama_model_load_internal(
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {
 
- lctx.t_start_us = ggml_time_us();
+ model.t_start_us = ggml_time_us();
 
  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
- lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
- auto & model = lctx.model;
+ vocab = std::move(ml->file_loaders.at(0)->vocab);
  model.hparams = ml->file_loaders.at(0)->hparams;
  model.n_gpu_layers = n_gpu_layers;
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1111,15 +1139,15 @@
 
  // create the ggml context
  {
- lctx.model.buf.resize(ctx_size);
+ model.buf.resize(ctx_size);
  if (use_mlock) {
- lctx.model.mlock_buf.init(lctx.model.buf.addr);
- lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+ model.mlock_buf.init(model.buf.addr);
+ model.mlock_buf.grow_to(model.buf.size);
  }
 
  struct ggml_init_params params = {
- /*.mem_size =*/ lctx.model.buf.size,
- /*.mem_buffer =*/ lctx.model.buf.addr,
+ /*.mem_size =*/ model.buf.size,
+ /*.mem_buffer =*/ model.buf.addr,
  /*.no_alloc =*/ ml->use_mmap,
  };
 
@@ -1249,7 +1277,7 @@ static void llama_model_load_internal(
  vram_scratch = n_batch * MB;
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
  __func__, vram_scratch / MB);
  }
  }
@@ -1300,7 +1328,7 @@ static void llama_model_load_internal(
  }
  #endif
 
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
@@ -1310,12 +1338,13 @@ static void llama_model_load_internal(
 
  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+ model.t_load_us = ggml_time_us() - model.t_start_us;
  }
 
  static bool llama_model_load(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1329,7 +1358,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1367,7 +1396,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;
 
- const auto & kv_self = model.kv_self;
+ const auto & kv_self = lctx.kv_self;
 
  LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1462,11 +1491,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");
 
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");
 
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");
 
@@ -1609,7 +1638,7 @@ static bool llama_eval_internal(
  model.layers[il].w1,
  cur);
  offload_func(cur);
- ggml_set_name(cur, "result_w2");
+ ggml_set_name(cur, "result_w1");
 
  // SILU activation
  cur = ggml_silu(ctx0, cur);
@@ -1646,15 +1675,11 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpL);
  offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_inpL");
-
- cur = ggml_rms_norm(ctx0, cur);
- offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_after");
+ ggml_set_name(cur, "rms_norm_2");
 
  // cur = cur*norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.norm);
- offload_func_nr(cur);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
  ggml_set_name(cur, "result_norm");
 
  embeddings = cur;
@@ -1719,7 +1744,7 @@ static bool llama_eval_internal(
  //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
  // update kv token count
- lctx.model.kv_self.n = n_past + N;
+ lctx.kv_self.n = n_past + N;
 
  // extract logits
  {
@@ -1998,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  for (size_t i = 0; i < candidates->size; ++i) {
  cum_sum += candidates->data[i].p;
 
- // Check if the running sum is greater than p or if we have kept at least min_keep tokens
- if (cum_sum > p && i >= min_keep) {
- last_idx = i;
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
+ if (cum_sum >= p && i + 1 >= min_keep) {
+ last_idx = i + 1;
  break;
  }
  }
@@ -2452,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<std::thread> workers;
  std::mutex mutex;
 
+ auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+ };
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
@@ -2485,21 +2515,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+ quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
+ fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+ fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+ fprintf(stderr, "========================================================================================\n\n");
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  if (tensor.name == "output.weight") {
- new_type = GGML_TYPE_Q6_K;
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K == 0 && ny % QK_K == 0) {
+ new_type = GGML_TYPE_Q6_K;
+ }
  } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
- (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
  } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
- (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
  } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2612,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // interface implementation
  //
 
- struct llama_context * llama_init_from_file(
+ struct llama_model * llama_load_model_from_file(
  const char * path_model,
  struct llama_context_params params) {
  ggml_time_init();
 
- llama_context * ctx = new llama_context;
+ llama_model * model = new llama_model;
+
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ delete model;
+ fprintf(stderr, "%s: failed to load model\n", __func__);
+ return nullptr;
+ }
+
+ return model;
+ }
+
+ void llama_free_model(struct llama_model * model) {
+ delete model;
+ }
+
+ struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+
+ if (!model) {
+ return nullptr;
+ }
+
+ llama_context * ctx = new llama_context(*model, model->vocab);
 
  if (params.seed < 0) {
  params.seed = time(NULL);
@@ -2645,24 +2718,16 @@ struct llama_context * llama_init_from_file(
 
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
- params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
- fprintf(stderr, "%s: failed to load model\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
-
  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }
 
  {
- const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+ const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }
 
@@ -2690,16 +2755,21 @@ struct llama_context * llama_init_from_file(
  // this allocates all Metal resources and memory buffers
  ctx->ctx_metal = ggml_metal_init();
 
- void *data_ptr = NULL;
+ void * data_ptr = NULL;
  size_t data_size = 0;
+
  if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size= ctx->model.mapping->size;
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
  } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size= ggml_get_mem_size(ctx->model.ctx);
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
  }
 
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+ printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
  fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2707,12 +2777,13 @@ struct llama_context * llama_init_from_file(
  return NULL; \
  }
 
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
  #endif
@@ -2720,7 +2791,23 @@ struct llama_context * llama_init_from_file(
  return ctx;
  }
 
+ struct llama_context * llama_init_from_file(
+ const char * path_model,
+ struct llama_context_params params) {
+
+ struct llama_model * model = llama_load_model_from_file(path_model, params);
+ if (!model) {
+ return nullptr;
+ }
+ struct llama_context * ctx = llama_new_context_with_model(model, params);
+ ctx->model_owner = true;
+ return ctx;
+ }
+
  void llama_free(struct llama_context * ctx) {
+ if (ctx->model_owner) {
+ delete &ctx->model;
+ }
  delete ctx;
  }
 
@@ -2737,11 +2824,9 @@ int llama_model_quantize(
  }
  }
 
- int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
  fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
- auto & model = ctx->model;
-
  const int64_t t_start_lora_us = ggml_time_us();
 
  auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2818,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
  }
  }
 
@@ -2984,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ return 1;
+ }
+ }
+
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
  } catch (const std::exception & err) {
  fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
@@ -2992,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
 
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ return ctx->kv_self.n;
  }
 
  #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3017,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
  const size_t s_kv_size = sizeof(size_t);
  const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ctx->model.kv_self.buf.size;
+ const size_t s_kv = ctx->kv_self.buf.size;
 
  const size_t s_total = (
  + s_rng_size
@@ -3083,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
  // copy kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3098,9 +3192,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
 
- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
 
@@ -3189,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
  // set kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3206,9 +3298,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
  const size_t elt_size = ggml_element_size(kv_self.k);
 
- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
 
@@ -3235,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }
 
- ctx->model.kv_self.n = kv_ntok;
+ ctx->kv_self.n = kv_ntok;
  }
 
  const size_t nread = inp - src;
@@ -3443,9 +3533,12 @@ void llama_print_timings(struct llama_context * ctx) {
 
  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }
 
@@ -3479,6 +3572,6 @@ const char * llama_print_system_info(void) {
  }
 
  // For internal test use
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
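
The most visible API change in this release is the split of model loading from context creation: llama_load_model_from_file / llama_free_model own the weights, llama_new_context_with_model builds a llama_context (which now holds its own kv_self cache) on top of a borrowed model, llama_init_backend takes a NUMA flag, and the old llama_init_from_file survives as a wrapper that marks the context as model_owner. The sketch below is illustrative only and not part of the diff; it assumes the package's llama.h header and a model path supplied by the caller, with prompt evaluation and sampling omitted.

#include "llama.h"   // header shipped with this package

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.bin>\n", argv[0]);
        return 1;
    }

    llama_init_backend(/*numa =*/ false);   // now takes a NUMA flag

    llama_context_params params = llama_context_default_params();

    // load the weights once ...
    llama_model * model = llama_load_model_from_file(argv[1], params);
    if (model == NULL) {
        return 1;
    }

    // ... then create a context (with its own KV cache) on top of them
    llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize / eval / sample with ctx here ...

    llama_free(ctx);          // frees the context and its KV cache
    llama_free_model(model);  // in this flow the caller owns the model
    return 0;
}

Because the context now only holds a const reference to the model, the apparent intent is that several contexts can share one loaded llama_model, and the new llama_model_apply_lora_from_file applies a LoRA adapter directly to a model rather than going through a context.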