llama_cpp 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,13 @@
 #include <stddef.h>
 
 // Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
 #define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
 
 //
 // Super-block quantization structures
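The new GGML_QKK_64 switch selects a 64-weight super-block with a 4-byte scale table instead of the default 256-weight super-block with a 12-byte scale table. A minimal standalone illustration of the compile-time selection, assuming only that the macro is passed to the compiler (e.g. with -DGGML_QKK_64); the translation unit below is hypothetical and not part of the package:

    // sketch: mirror of the QK_K / K_SCALE_SIZE selection above
    #include <cstdio>

    #ifdef GGML_QKK_64
    #define QK_K 64
    #define K_SCALE_SIZE 4
    #else
    #define QK_K 256
    #define K_SCALE_SIZE 12
    #endif

    int main() {
        // prints 256/12 by default, 64/4 when built with -DGGML_QKK_64
        printf("QK_K = %d, K_SCALE_SIZE = %d\n", QK_K, K_SCALE_SIZE);
        return 0;
    }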
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 // weight is represented as x = a * q
 // 16 blocks of 16 elemenets each
 // Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4]; // quants - low 2 bits
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    uint8_t scales[2];
     ggml_fp16_t d; // super-block scale
 } block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8]; // quants - high bit
+    uint8_t qs[QK_K/4]; // quants - low 2 bits
+    uint8_t scales[12]; // scales, quantized with 6 bits
+    ggml_fp16_t d; // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif
 
 // 4-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d[2]; // super-block scales/mins
+    uint8_t scales[2]; // 4-bit block scales/mins
+    uint8_t qs[QK_K/2]; // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d; // super-block scale for quantized scales
     ggml_fp16_t dmin; // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif
 
 // 5-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d; // super-block scale
+    int8_t scales[QK_K/16]; // 8-bit block scales
+    uint8_t qh[QK_K/8]; // quants, high bit
+    uint8_t qs[QK_K/2]; // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d; // super-block scale for quantized scales
     ggml_fp16_t dmin; // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8]; // quants, high bit
     uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
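As a sanity check on the static_asserts above, the per-super-block sizes and the bits-per-weight figures in the comments can be reproduced with a standalone mirror of the default (QK_K = 256) layouts. The only assumption is that ggml_fp16_t is a 2-byte storage type, as it is in ggml; this sketch is illustrative and not part of the package:

    // sketch: byte sizes and bits-per-weight implied by the QK_K = 256 structs above
    #include <cstdint>
    #include <cstdio>

    #define QK_K 256
    #define K_SCALE_SIZE 12
    typedef uint16_t ggml_fp16_t; // assumption: 2-byte half-precision storage type

    typedef struct {
        uint8_t hmask[QK_K/8];
        uint8_t qs[QK_K/4];
        uint8_t scales[12];
        ggml_fp16_t d;
    } block_q3_K; // 32 + 64 + 12 + 2 = 110 bytes -> 110*8/256 = 3.4375 bpw

    typedef struct {
        ggml_fp16_t d, dmin;
        uint8_t scales[K_SCALE_SIZE];
        uint8_t qs[QK_K/2];
    } block_q4_K; // 2 + 2 + 12 + 128 = 144 bytes -> 4.5 bpw

    typedef struct {
        ggml_fp16_t d, dmin;
        uint8_t scales[K_SCALE_SIZE];
        uint8_t qh[QK_K/8];
        uint8_t qs[QK_K/2];
    } block_q5_K; // 2 + 2 + 12 + 32 + 128 = 176 bytes -> 5.5 bpw

    int main() {
        printf("q3_K: %zu bytes, %.4f bpw\n", sizeof(block_q3_K), 8.0*sizeof(block_q3_K)/QK_K);
        printf("q4_K: %zu bytes, %.4f bpw\n", sizeof(block_q4_K), 8.0*sizeof(block_q4_K)/QK_K);
        printf("q5_K: %zu bytes, %.4f bpw\n", sizeof(block_q5_K), 8.0*sizeof(block_q5_K)/QK_K);
        return 0;
    }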
@@ -172,12 +172,14 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
+        // prefetch/readahead impairs performance on NUMA systems
+        if (numa) { prefetch = 0; }
 #ifdef __linux__
-        flags |= MAP_POPULATE;
+        if (prefetch) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
                         strerror(errno));
             }
         }
+        if (numa) {
+            // advise the kernel not to use readahead
+            // (because the next page might not belong on the same node)
+            if (madvise(addr, file->size, MADV_RANDOM)) {
+                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                        strerror(errno));
+            }
+        }
     }
 
     ~llama_mmap() {
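On NUMA systems the new path disables prefetch, skips MAP_POPULATE, and additionally asks the kernel to turn off readahead on the mapping, since consecutive pages may not live on the same node. A minimal standalone sketch of the same pattern using plain POSIX calls (the file path is a hypothetical placeholder):

    // sketch: map a file read-only and disable readahead, mirroring the numa branch above
    #include <cerrno>
    #include <cstdio>
    #include <cstring>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main() {
        const char * path = "model.bin"; // hypothetical file
        int fd = open(path, O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        struct stat st;
        if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }

        // on a NUMA system: no MAP_POPULATE here ...
        void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

        // ... and no readahead on the mapping
        if (madvise(addr, st.st_size, MADV_RANDOM)) {
            fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", strerror(errno));
        }

        munmap(addr, st.st_size);
        close(fd);
        return 0;
    }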
@@ -199,7 +209,9 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+        (void) numa;
+
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *, bool prefetch = true) {
-        (void)prefetch;
+    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+        (void) prefetch;
+        (void) numa;
+
         throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
@@ -21,9 +21,13 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
 #define QK_K 256
 #endif
 #endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_vocab {
+    using id = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
 
@@ -198,10 +215,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;
 
-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;
 
@@ -215,6 +228,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
     }
 };
 
-struct llama_vocab {
-    using id = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;
 
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
     int32_t n_eval = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
 
     size_t mem_per_token = 0;
 
@@ -761,7 +774,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -964,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -973,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
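Callers that previously used llama_init_backend() with no arguments now pass a flag; when it is true the backend also initializes ggml's NUMA support via ggml_numa_init(). A minimal usage sketch of the updated entry point, assuming only that llama.h from this release is on the include path; deciding when to pass true (e.g. from a --numa command-line switch, as used here for illustration) is up to the application:

    // sketch: initialize the backend once at program start
    #include "llama.h"
    #include <string>

    int main(int argc, char ** argv) {
        const bool numa = (argc > 1 && std::string(argv[1]) == "--numa"); // hypothetical flag
        llama_init_backend(numa); // with numa == true this also calls ggml_numa_init()
        // ... load models, create contexts ...
        return 0;
    }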
@@ -1033,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {
 
 static void llama_model_load_internal(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1047,12 +1065,11 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
 
-    lctx.t_start_us = ggml_time_us();
+    model.t_start_us = ggml_time_us();
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1139,15 @@ static void llama_model_load_internal(
 
     // create the ggml context
     {
-        lctx.model.buf.resize(ctx_size);
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
-            /*.mem_size =*/ lctx.model.buf.size,
-            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.mem_size =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc =*/ ml->use_mmap,
         };
 
@@ -1311,7 +1328,7 @@ static void llama_model_load_internal(
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1338,13 @@ static void llama_model_load_internal(
 
     // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
-    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }
 
 static bool llama_model_load(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1340,7 +1358,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1378,7 +1396,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
 
-    const auto & kv_self = model.kv_self;
+    const auto & kv_self = lctx.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1473,11 +1491,11 @@ static bool llama_eval_internal(
         offload_func_kq(tmpq);
         ggml_set_name(tmpq, "tmpq");
 
-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
         offload_func_kq(Kcur);
         ggml_set_name(Kcur, "Kcur");
 
-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
         offload_func_kq(Qcur);
         ggml_set_name(Qcur, "Qcur");
 
@@ -1726,7 +1744,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
-    lctx.model.kv_self.n = n_past + N;
+    lctx.kv_self.n = n_past + N;
 
     // extract logits
     {
@@ -2005,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;
 
-        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep) {
-            last_idx = i;
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
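The top-p change alters which candidate closes the nucleus: the token whose probability pushes the cumulative sum to or past p is now kept, and the min_keep comparison counts tokens rather than indices. A toy illustration of the corrected cutoff rule on an already-sorted distribution (the surrounding bookkeeping such as sorting and renormalization is omitted; this is not the library code itself):

    // sketch: the new cutoff rule applied to a toy, already-sorted distribution
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> p_sorted = {0.5f, 0.3f, 0.15f, 0.05f};
        const float p = 0.9f;
        const size_t min_keep = 1;

        size_t last_idx = p_sorted.size();
        float cum_sum = 0.0f;
        for (size_t i = 0; i < p_sorted.size(); ++i) {
            cum_sum += p_sorted[i];
            if (cum_sum >= p && i + 1 >= min_keep) {
                last_idx = i + 1; // the token that reaches p is included
                break;
            }
        }
        // keeps 3 tokens with mass 0.95 >= p; the old rule (cum_sum > p, last_idx = i)
        // would have kept only 2 tokens with mass 0.8 < p
        printf("kept %zu of %zu candidates\n", last_idx, p_sorted.size());
        return 0;
    }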
@@ -2459,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
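The new use_more_bits helper factors out the layer-selection rule that was previously duplicated for the attention.wv and feed_forward.w2 branches: roughly the first and last eighth of the layers, plus every third layer in between, are bumped to Q6_K. For example, with a hypothetical 32-layer model it selects layers 0-3, then 6, 9, ..., 27, then 28-31:

    // sketch: which layers the use_more_bits rule selects for a hypothetical 32-layer model
    #include <cstdio>

    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    int main() {
        const int num_layers = 32; // assumption: e.g. a 7B-sized model
        for (int i = 0; i < num_layers; ++i) {
            if (use_more_bits(i, num_layers)) {
                printf("%d ", i);
            }
        }
        printf("\n"); // prints 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
        return 0;
    }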
@@ -2513,15 +2536,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
-                    (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                    use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+            else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
            ++i_attention_wv;
        } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
-                    (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                    use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
            ++i_feed_forward_w2;
        } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2634,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //
 
-struct llama_context * llama_init_from_file(
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();
 
-    llama_context * ctx = new llama_context;
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);
 
     if (params.seed < 0) {
         params.seed = time(NULL);
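Model loading is now split from context creation, so a single llama_model can back several llama_context instances while each context keeps its own kv_self cache. A minimal usage sketch of the entry points introduced above (the model path and default parameters are placeholders); note that llama_free(ctx) only deletes the model when the context was created through the legacy llama_init_from_file wrapper, which sets model_owner, as shown further down in this diff:

    // sketch: explicit model/context lifetimes with the new API
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_init_backend(false); // no NUMA handling in this sketch

        llama_context_params params = llama_context_default_params();

        llama_model * model = llama_load_model_from_file("ggml-model-q4_K.bin", params); // hypothetical path
        if (!model) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // two contexts sharing one set of weights; each gets its own kv_self cache
        llama_context * ctx_a = llama_new_context_with_model(model, params);
        llama_context * ctx_b = llama_new_context_with_model(model, params);

        // ... evaluate / sample with ctx_a and ctx_b ...

        llama_free(ctx_a);       // does not free the model (model_owner is false here)
        llama_free(ctx_b);
        llama_free_model(model); // the caller owns the model and frees it explicitly
        return 0;
    }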
@@ -2667,24 +2718,16 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }
 
        {
-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
            fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }
 
@@ -2736,8 +2779,8 @@ struct llama_context * llama_init_from_file(
 
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2791,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }
 
+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }
 
@@ -2765,11 +2824,9 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2846,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
        // maybe this should in llama_model_loader
        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
        }
    }
 
@@ -3012,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
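LoRA application is now routed through the model rather than the context, and a model-level wrapper llama_model_apply_lora_from_file is exposed so an adapter can be applied before any context exists. A minimal sketch of that flow (the paths are hypothetical placeholders; a return value of 0 indicates success, matching the wrappers above):

    // sketch: apply a LoRA adapter at the model level before creating contexts
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_init_backend(false);

        llama_context_params params = llama_context_default_params();
        llama_model * model = llama_load_model_from_file("ggml-base-f16.bin", params); // hypothetical path
        if (!model) {
            return 1;
        }

        // pass a base model path instead of NULL when the target model is quantized
        if (llama_model_apply_lora_from_file(model, "lora-adapter.bin", NULL, /*n_threads=*/4) != 0) {
            fprintf(stderr, "failed to apply lora adapter\n");
            llama_free_model(model);
            return 1;
        }

        llama_context * ctx = llama_new_context_with_model(model, params);
        // ... use ctx ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }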
@@ -3020,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 }
 
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->model.kv_self.n;
+    return ctx->kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size = sizeof(size_t);
     const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = ctx->model.kv_self.buf.size;
+    const size_t s_kv = ctx->kv_self.buf.size;
 
     const size_t s_total = (
         + s_rng_size
@@ -3111,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
     // copy kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
        const auto & hparams = ctx->model.hparams;
        const int n_layer = hparams.n_layer;
        const int n_embd = hparams.n_embd;
@@ -3215,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
        const auto & hparams = ctx->model.hparams;
        const int n_layer = hparams.n_layer;
        const int n_embd = hparams.n_embd;
@@ -3259,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_free(cpy_ctx);
        }
 
-        ctx->model.kv_self.n = kv_ntok;
+        ctx->kv_self.n = kv_ntok;
     }
 
     const size_t nread = inp - src;
@@ -3506,6 +3572,6 @@ const char * llama_print_system_info(void) {
 }
 
 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }