llama_cpp 0.2.1 → 0.3.0

@@ -19,6 +19,15 @@
  #ifdef GGML_USE_METAL
  #include "ggml-metal.h"
  #endif
+ #ifdef GGML_USE_K_QUANTS
+ #ifndef QK_K
+ #ifdef GGML_QKK_64
+ #define QK_K 64
+ #else
+ #define QK_K 256
+ #endif
+ #endif
+ #endif
 
  #include <array>
  #include <ctime>
@@ -40,6 +49,10 @@
  #include <sstream>
  #include <numeric>
 
+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif
+
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -173,6 +186,19 @@ struct llama_kv_cache {
  }
  };
 
+ struct llama_vocab {
+ using id = int32_t;
+ using token = std::string;
+
+ struct token_score {
+ token tok;
+ float score;
+ };
+
+ std::unordered_map<token, id> token_to_id;
+ std::vector<token_score> id_to_token;
+ };
+
  struct llama_model {
  e_model type = MODEL_UNKNOWN;
 
@@ -189,10 +215,6 @@ struct llama_model {
  // context
  struct ggml_context * ctx = NULL;
 
- // key + value cache for the self attention
- // TODO: move to llama_state
- struct llama_kv_cache kv_self;
-
  // the model memory buffer
  llama_ctx_buffer buf;
 
@@ -206,6 +228,11 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+ int64_t t_load_us = 0;
+ int64_t t_start_us = 0;
+
+ llama_vocab vocab;
+
  ~llama_model() {
  if (ctx) {
  ggml_free(ctx);
@@ -224,24 +251,11 @@ struct llama_model {
  }
  };
 
- struct llama_vocab {
- using id = int32_t;
- using token = std::string;
-
- struct token_score {
- token tok;
- float score;
- };
-
- std::unordered_map<token, id> token_to_id;
- std::vector<token_score> id_to_token;
- };
-
  struct llama_context {
+ llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
  std::mt19937 rng;
 
- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
  bool has_evaluated_once = false;
 
  int64_t t_sample_us = 0;
@@ -252,8 +266,16 @@ struct llama_context {
  int32_t n_eval = 0; // number of eval calls
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
- llama_model model;
- llama_vocab vocab;
+ const llama_model & model;
+ const llama_vocab & vocab;
+
+ bool model_owner = false;
+
+ int64_t t_load_us;
+ int64_t t_start_us;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;
 
  size_t mem_per_token = 0;
 
@@ -752,7 +774,7 @@ struct llama_model_loader {
  }
 
  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -882,6 +904,7 @@ static bool kv_cache_init(
  const int64_t n_elements = n_embd*n_mem;
 
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ cache.n = 0;
 
  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -900,6 +923,7 @@ static bool kv_cache_init(
  ggml_set_name(cache.k, "cache_k");
  ggml_set_name(cache.v, "cache_v");
 
+ (void) n_gpu_layers;
  #ifdef GGML_USE_CUBLAS
  if (n_gpu_layers > n_layer + 1) {
  ggml_cuda_assign_buffers_no_scratch(cache.v);
@@ -914,21 +938,21 @@
 
  struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
+ /*.seed =*/ -1,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ {0},
+ /*.progress_callback =*/ nullptr,
+ /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
- /*.seed =*/ -1,
  /*.f16_kv =*/ true,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
  /*.use_mlock =*/ false,
  /*.embedding =*/ false,
- /*.progress_callback =*/ nullptr,
- /*.progress_callback_user_data =*/ nullptr,
  };
 
  return result;
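
Note: the field order of llama_context_params changed (seed now comes first and the progress callback members moved up next to tensor_split), so callers that filled the struct by positional aggregate initialization will silently assign the wrong fields. A minimal sketch of the safer pattern, taking the defaults and overriding by name (the override values below are arbitrary examples):

// Sketch: start from the defaults, then set fields by name instead of by position.
struct llama_context_params params = llama_context_default_params();
params.seed  = 1234;  // -1 keeps the time-based default seed
params.n_ctx = 2048;
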
@@ -953,7 +977,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }
 
- void llama_init_backend() {
+ void llama_init_backend(bool numa) {
  ggml_time_init();
 
  // needed to initialize f16 tables
@@ -962,6 +986,10 @@ void llama_init_backend() {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
+
+ if (numa) {
+ ggml_numa_init();
+ }
  }
 
  int64_t llama_time_us() {
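
Note: llama_init_backend now takes a NUMA flag and calls ggml_numa_init() when it is set, so every existing call site needs one extra argument. A minimal sketch of the updated call, assuming a single-socket machine where false is the sensible default:

// Sketch: one-time backend initialisation before any model is loaded.
llama_init_backend(/* numa = */ false);  // pass true to enable ggml's NUMA handling
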
@@ -1022,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {
 
  static void llama_model_load_internal(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1036,12 +1065,11 @@ static void llama_model_load_internal(
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {
 
- lctx.t_start_us = ggml_time_us();
+ model.t_start_us = ggml_time_us();
 
  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
- lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
- auto & model = lctx.model;
+ vocab = std::move(ml->file_loaders.at(0)->vocab);
  model.hparams = ml->file_loaders.at(0)->hparams;
  model.n_gpu_layers = n_gpu_layers;
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1111,15 +1139,15 @@
 
  // create the ggml context
  {
- lctx.model.buf.resize(ctx_size);
+ model.buf.resize(ctx_size);
  if (use_mlock) {
- lctx.model.mlock_buf.init(lctx.model.buf.addr);
- lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+ model.mlock_buf.init(model.buf.addr);
+ model.mlock_buf.grow_to(model.buf.size);
  }
 
  struct ggml_init_params params = {
- /*.mem_size =*/ lctx.model.buf.size,
- /*.mem_buffer =*/ lctx.model.buf.addr,
+ /*.mem_size =*/ model.buf.size,
+ /*.mem_buffer =*/ model.buf.addr,
  /*.no_alloc =*/ ml->use_mmap,
  };
 
@@ -1249,7 +1277,7 @@ static void llama_model_load_internal(
  vram_scratch = n_batch * MB;
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
  __func__, vram_scratch / MB);
  }
  }
@@ -1300,7 +1328,7 @@ static void llama_model_load_internal(
  }
  #endif
 
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
@@ -1310,12 +1338,13 @@ static void llama_model_load_internal(
 
  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+ model.t_load_us = ggml_time_us() - model.t_start_us;
  }
 
  static bool llama_model_load(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1329,7 +1358,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1367,7 +1396,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;
 
- const auto & kv_self = model.kv_self;
+ const auto & kv_self = lctx.kv_self;
 
  LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1462,11 +1491,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");
 
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");
 
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");
 
@@ -1609,7 +1638,7 @@ static bool llama_eval_internal(
  model.layers[il].w1,
  cur);
  offload_func(cur);
- ggml_set_name(cur, "result_w2");
+ ggml_set_name(cur, "result_w1");
 
  // SILU activation
  cur = ggml_silu(ctx0, cur);
@@ -1646,15 +1675,11 @@ static bool llama_eval_internal(
  {
  cur = ggml_rms_norm(ctx0, inpL);
  offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_inpL");
-
- cur = ggml_rms_norm(ctx0, cur);
- offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_after");
+ ggml_set_name(cur, "rms_norm_2");
 
  // cur = cur*norm(broadcasted)
  cur = ggml_mul(ctx0, cur, model.norm);
- offload_func_nr(cur);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
  ggml_set_name(cur, "result_norm");
 
  embeddings = cur;
@@ -1719,7 +1744,7 @@ static bool llama_eval_internal(
  //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
  // update kv token count
- lctx.model.kv_self.n = n_past + N;
+ lctx.kv_self.n = n_past + N;
 
  // extract logits
  {
@@ -1998,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  for (size_t i = 0; i < candidates->size; ++i) {
  cum_sum += candidates->data[i].p;
 
- // Check if the running sum is greater than p or if we have kept at least min_keep tokens
- if (cum_sum > p && i >= min_keep) {
- last_idx = i;
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
+ if (cum_sum >= p && i + 1 >= min_keep) {
+ last_idx = i + 1;
  break;
  }
  }
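
Note: the top-p cutoff now uses >= and stores i + 1, so last_idx is the number of kept candidates and the token that crosses the threshold is included; the old code truncated just before that token and could keep less than p of the probability mass. A standalone sketch of the new rule on a plain probability array (this is just the arithmetic, not the library API):

// Keep the smallest prefix whose cumulative probability reaches p.
float probs[] = { 0.5f, 0.4f, 0.1f };  // assumed already sorted in descending order
size_t min_keep = 1, last_idx = 3;
float p = 0.8f, cum_sum = 0.0f;
for (size_t i = 0; i < 3; ++i) {
    cum_sum += probs[i];
    if (cum_sum >= p && i + 1 >= min_keep) { last_idx = i + 1; break; }
}
// last_idx == 2: two tokens kept (mass 0.9 >= p); the old '> p' / 'last_idx = i'
// version stopped at last_idx == 1 and kept only 0.5 of the mass.
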
@@ -2452,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<std::thread> workers;
  std::mutex mutex;
 
+ auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+ };
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
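
Note: the promotion rule that was previously written out inline for attention.wv and feed_forward.w2 is now the shared use_more_bits helper: the first eighth of the layers, the last eighth, and every third layer of the middle section get the higher-bit quant type. A quick standalone check of which layer indices it selects for an assumed 32-layer model:

#include <cstdio>

// Same rule as the use_more_bits lambda above.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n = 32;  // assumed layer count (e.g. a 7B-sized model)
    for (int i = 0; i < n; ++i) {
        if (use_more_bits(i, n)) printf("%d ", i);
    }
    printf("\n");      // prints: 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
}
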
@@ -2485,21 +2515,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  } else {
  new_type = quantized_type;
  #ifdef GGML_USE_K_QUANTS
+ if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+ quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
+ fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+ fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+ fprintf(stderr, "========================================================================================\n\n");
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
  if (tensor.name == "output.weight") {
- new_type = GGML_TYPE_Q6_K;
+ int nx = tensor.ne.at(0);
+ int ny = tensor.ne.at(1);
+ if (nx % QK_K == 0 && ny % QK_K == 0) {
+ new_type = GGML_TYPE_Q6_K;
+ }
  } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
- (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
  } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
- (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
  } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2612,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // interface implementation
  //
 
- struct llama_context * llama_init_from_file(
+ struct llama_model * llama_load_model_from_file(
  const char * path_model,
  struct llama_context_params params) {
  ggml_time_init();
 
- llama_context * ctx = new llama_context;
+ llama_model * model = new llama_model;
+
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ delete model;
+ fprintf(stderr, "%s: failed to load model\n", __func__);
+ return nullptr;
+ }
+
+ return model;
+ }
+
+ void llama_free_model(struct llama_model * model) {
+ delete model;
+ }
+
+ struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+
+ if (!model) {
+ return nullptr;
+ }
+
+ llama_context * ctx = new llama_context(*model, model->vocab);
 
  if (params.seed < 0) {
  params.seed = time(NULL);
@@ -2645,24 +2718,16 @@ struct llama_context * llama_init_from_file(
 
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
- params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
- fprintf(stderr, "%s: failed to load model\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
-
  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }
 
  {
- const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+ const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }
 
@@ -2690,16 +2755,21 @@ struct llama_context * llama_init_from_file(
  // this allocates all Metal resources and memory buffers
  ctx->ctx_metal = ggml_metal_init();
 
- void *data_ptr = NULL;
+ void * data_ptr = NULL;
  size_t data_size = 0;
+
  if (params.use_mmap) {
- data_ptr = ctx->model.mapping->addr;
- data_size= ctx->model.mapping->size;
+ data_ptr = ctx->model.mapping->addr;
+ data_size = ctx->model.mapping->size;
  } else {
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
- data_size= ggml_get_mem_size(ctx->model.ctx);
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+ data_size = ggml_get_mem_size (ctx->model.ctx);
  }
 
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+ printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
  fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2707,12 +2777,13 @@ struct llama_context * llama_init_from_file(
  return NULL; \
  }
 
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
  #endif
@@ -2720,7 +2791,23 @@ struct llama_context * llama_init_from_file(
  return ctx;
  }
 
+ struct llama_context * llama_init_from_file(
+ const char * path_model,
+ struct llama_context_params params) {
+
+ struct llama_model * model = llama_load_model_from_file(path_model, params);
+ if (!model) {
+ return nullptr;
+ }
+ struct llama_context * ctx = llama_new_context_with_model(model, params);
+ ctx->model_owner = true;
+ return ctx;
+ }
+
  void llama_free(struct llama_context * ctx) {
+ if (ctx->model_owner) {
+ delete &ctx->model;
+ }
  delete ctx;
  }
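
Note: the public API now separates model loading from context creation. llama_load_model_from_file owns the weights and vocabulary, llama_new_context_with_model builds a context that only references them (plus its own kv_self cache), and llama_init_from_file survives as a wrapper that marks the context as the model's owner so llama_free keeps working for old callers. A minimal sketch of the new flow with one model shared by two contexts (the file name and error handling are placeholders):

llama_init_backend(false);

struct llama_context_params params = llama_context_default_params();

// load the weights once ...
struct llama_model * model = llama_load_model_from_file("model.bin", params);
if (model == NULL) { /* handle the load failure */ }

// ... then attach as many contexts as needed; each gets its own KV cache
struct llama_context * ctx_a = llama_new_context_with_model(model, params);
struct llama_context * ctx_b = llama_new_context_with_model(model, params);

llama_free(ctx_a);        // model_owner is false here, so the weights stay alive
llama_free(ctx_b);
llama_free_model(model);  // free the shared weights explicitly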
 
@@ -2737,11 +2824,9 @@ int llama_model_quantize(
  }
  }
 
- int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
  fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
- auto & model = ctx->model;
-
  const int64_t t_start_lora_us = ggml_time_us();
 
  auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2818,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
  }
  }
 
@@ -2984,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ return 1;
+ }
+ }
+
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
  } catch (const std::exception & err) {
  fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
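
Note: LoRA application is now available at the model level as well: llama_model_apply_lora_from_file patches the shared weights before any context exists, while llama_apply_lora_from_file keeps the old context-based entry point by forwarding ctx->model. A minimal sketch (file paths are placeholders, n_threads = 4 is an arbitrary choice, and passing NULL for path_base_model assumes the loaded weights can be patched directly):

struct llama_model * model = llama_load_model_from_file("model.bin", params);

// patch the shared weights once, before creating any contexts
if (llama_model_apply_lora_from_file(model, "lora.bin", NULL, /* n_threads = */ 4) != 0) {
    fprintf(stderr, "failed to apply the LoRA adapter\n");
}

struct llama_context * ctx = llama_new_context_with_model(model, params);
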
@@ -2992,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
 
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ return ctx->kv_self.n;
  }
 
  #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3017,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
  const size_t s_kv_size = sizeof(size_t);
  const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ctx->model.kv_self.buf.size;
+ const size_t s_kv = ctx->kv_self.buf.size;
 
  const size_t s_total = (
  + s_rng_size
@@ -3083,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
  // copy kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3098,9 +3192,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  if (kv_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
 
- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
 
@@ -3189,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
  // set kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3206,9 +3298,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
  const size_t elt_size = ggml_element_size(kv_self.k);
 
- char buffer[4096];
-
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
  ggml_cgraph gf{};
  gf.n_threads = 1;
 
@@ -3235,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }
 
- ctx->model.kv_self.n = kv_ntok;
+ ctx->kv_self.n = kv_ntok;
  }
 
  const size_t nread = inp - src;
@@ -3443,9 +3533,12 @@ void llama_print_timings(struct llama_context * ctx) {
 
  fprintf(stderr, "\n");
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  }
 
@@ -3479,6 +3572,6 @@ const char * llama_print_system_info(void) {
  }
 
  // For internal test use
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }