llama_cpp 0.2.2 → 0.3.0

@@ -7,7 +7,13 @@
  #include <stddef.h>

  // Super-block size
+ #ifdef GGML_QKK_64
+ #define QK_K 64
+ #define K_SCALE_SIZE 4
+ #else
  #define QK_K 256
+ #define K_SCALE_SIZE 12
+ #endif

  //
  // Super-block quantization structures
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
  // weight is represented as x = a * q
  // 16 blocks of 16 elemenets each
  // Effectively 3.4375 bits per weight
+ #ifdef GGML_QKK_64
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+ uint8_t scales[2];
  ggml_fp16_t d; // super-block scale
  } block_q3_K;
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+ #else
+ typedef struct {
+ uint8_t hmask[QK_K/8]; // quants - high bit
+ uint8_t qs[QK_K/4]; // quants - low 2 bits
+ uint8_t scales[12]; // scales, quantized with 6 bits
+ ggml_fp16_t d; // super-block scale
+ } block_q3_K;
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+ #endif

  // 4-bit quantization
  // 16 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 4.5 bits per weight
+ #ifdef GGML_QKK_64
+ typedef struct {
+ ggml_fp16_t d[2]; // super-block scales/mins
+ uint8_t scales[2]; // 4-bit block scales/mins
+ uint8_t qs[QK_K/2]; // 4--bit quants
+ } block_q4_K;
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+ #else
  typedef struct {
  ggml_fp16_t d; // super-block scale for quantized scales
  ggml_fp16_t dmin; // super-block scale for quantized mins
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+ #endif

  // 5-bit quantization
  // 16 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 5.5 bits per weight
+ #ifdef GGML_QKK_64
+ typedef struct {
+ ggml_fp16_t d; // super-block scale
+ int8_t scales[QK_K/16]; // 8-bit block scales
+ uint8_t qh[QK_K/8]; // quants, high bit
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
+ } block_q5_K;
+ static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+ #else
  typedef struct {
  ggml_fp16_t d; // super-block scale for quantized scales
  ggml_fp16_t dmin; // super-block scale for quantized mins
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
  uint8_t qh[QK_K/8]; // quants, high bit
  uint8_t qs[QK_K/2]; // quants, low 4 bits
  } block_q5_K;
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+ #endif

  // 6-bit quantization
  // weight is represented as x = a * q
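
For orientation: the new GGML_QKK_64 branch keeps the same field names but shrinks a super-block to 64 weights, while the default branch stays at 256 weights with a 12-byte scale array (K_SCALE_SIZE). The following standalone sketch is not part of the gem; it merely recomputes the "effective bits per weight" figures quoted in the comments from the static_assert sizes of the default 256-weight blocks.

    // Recompute bits-per-weight for the default QK_K == 256 layouts above.
    #include <cstdio>

    int main() {
        const int QK_K         = 256;
        const int K_SCALE_SIZE = 12;
        const int FP16         = 2; // sizeof(ggml_fp16_t)

        const int q3_K = FP16   + QK_K/4 + QK_K/8 + K_SCALE_SIZE; // 110 bytes per super-block
        const int q4_K = 2*FP16 + K_SCALE_SIZE + QK_K/2;          // 144 bytes per super-block
        const int q5_K = 2*FP16 + K_SCALE_SIZE + QK_K/2 + QK_K/8; // 176 bytes per super-block

        std::printf("q3_K: %.4f bits/weight\n", 8.0 * q3_K / QK_K); // 3.4375
        std::printf("q4_K: %.4f bits/weight\n", 8.0 * q4_K / QK_K); // 4.5000
        std::printf("q5_K: %.4f bits/weight\n", 8.0 * q5_K / QK_K); // 5.5000
        return 0;
    }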
@@ -172,12 +172,14 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
  static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
  size = file->size;
  int fd = fileno(file->fp);
  int flags = MAP_SHARED;
+ // prefetch/readahead impairs performance on NUMA systems
+ if (numa) { prefetch = 0; }
  #ifdef __linux__
- flags |= MAP_POPULATE;
+ if (prefetch) { flags |= MAP_POPULATE; }
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
  if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
  strerror(errno));
  }
  }
+ if (numa) {
+ // advise the kernel not to use readahead
+ // (because the next page might not belong on the same node)
+ if (madvise(addr, file->size, MADV_RANDOM)) {
+ fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+ strerror(errno));
+ }
+ }
  }

  ~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
  #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file, bool prefetch = true) {
+ llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+ (void) numa;
+
  size = file->size;

  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
  #else
  static constexpr bool SUPPORTED = false;

- llama_mmap(struct llama_file *, bool prefetch = true) {
- (void)prefetch;
+ llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+ (void) prefetch;
+ (void) numa;
+
  throw std::runtime_error(std::string("mmap not supported"));
  }
  #endif
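
The llama_mmap changes above add an optional NUMA mode: prefetching is disabled, MAP_POPULATE is only requested when prefetch is still wanted, and MADV_RANDOM tells the kernel to skip readahead because the next page may live on a different node. A minimal POSIX-only sketch of that pattern (the map_readonly helper and its file argument are illustrative, not part of llama.cpp):

    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cerrno>
    #include <cstdio>
    #include <cstring>

    // Map a file read-only, avoiding readahead on NUMA systems.
    void * map_readonly(const char * path, size_t & size, bool numa) {
        int fd = open(path, O_RDONLY);
        if (fd < 0) { return nullptr; }

        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return nullptr; }
        size = (size_t) st.st_size;

        int flags = MAP_SHARED;
    #ifdef __linux__
        // prefetch/readahead impairs performance on NUMA systems
        if (!numa) { flags |= MAP_POPULATE; }
    #endif
        void * addr = mmap(NULL, size, PROT_READ, flags, fd, 0);
        close(fd); // the mapping stays valid after the descriptor is closed

        if (addr == MAP_FAILED) { return nullptr; }
        if (numa && madvise(addr, size, MADV_RANDOM) != 0) {
            // advise the kernel not to use readahead
            // (the next page might not belong on the same node)
            std::fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", std::strerror(errno));
        }
        return addr;
    }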
@@ -21,9 +21,13 @@
  #endif
  #ifdef GGML_USE_K_QUANTS
  #ifndef QK_K
+ #ifdef GGML_QKK_64
+ #define QK_K 64
+ #else
  #define QK_K 256
  #endif
  #endif
+ #endif

  #include <array>
  #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
  }
  };

+ struct llama_vocab {
+ using id = int32_t;
+ using token = std::string;
+
+ struct token_score {
+ token tok;
+ float score;
+ };
+
+ std::unordered_map<token, id> token_to_id;
+ std::vector<token_score> id_to_token;
+ };
+
  struct llama_model {
  e_model type = MODEL_UNKNOWN;

@@ -198,10 +215,6 @@ struct llama_model {
  // context
  struct ggml_context * ctx = NULL;

- // key + value cache for the self attention
- // TODO: move to llama_state
- struct llama_kv_cache kv_self;
-
  // the model memory buffer
  llama_ctx_buffer buf;

@@ -215,6 +228,11 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

+ int64_t t_load_us = 0;
+ int64_t t_start_us = 0;
+
+ llama_vocab vocab;
+
  ~llama_model() {
  if (ctx) {
  ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
  }
  };

- struct llama_vocab {
- using id = int32_t;
- using token = std::string;
-
- struct token_score {
- token tok;
- float score;
- };
-
- std::unordered_map<token, id> token_to_id;
- std::vector<token_score> id_to_token;
- };
-
  struct llama_context {
+ llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
  std::mt19937 rng;

- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
  bool has_evaluated_once = false;

  int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
  int32_t n_eval = 0; // number of eval calls
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

- llama_model model;
- llama_vocab vocab;
+ const llama_model & model;
+ const llama_vocab & vocab;
+
+ bool model_owner = false;
+
+ int64_t t_load_us;
+ int64_t t_start_us;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;

  size_t mem_per_token = 0;

@@ -761,7 +774,7 @@ struct llama_model_loader {
  }

  if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+ mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
  if (lmlock) {
  lmlock->init(mapping->addr);
  }
@@ -964,7 +977,7 @@ bool llama_mlock_supported() {
  return llama_mlock::SUPPORTED;
  }

- void llama_init_backend() {
+ void llama_init_backend(bool numa) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -973,6 +986,10 @@ void llama_init_backend() {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
+
+ if (numa) {
+ ggml_numa_init();
+ }
  }

  int64_t llama_time_us() {
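
llama_init_backend() now takes a single boolean: when it is true, ggml_numa_init() is called so ggml can account for NUMA topology. A minimal call-site sketch (the command-line handling is illustrative only):

    #include "llama.h"

    int main(int argc, char ** argv) {
        const bool numa = (argc > 1); // e.g. pass any argument to enable NUMA mode
        llama_init_backend(numa);     // 0.2.2 exposed llama_init_backend() with no argument
        // ... load a model and run inference ...
        return 0;
    }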
@@ -1033,7 +1050,8 @@ static const char *llama_model_type_name(e_model type) {

  static void llama_model_load_internal(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1047,12 +1065,11 @@ static void llama_model_load_internal(
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {

- lctx.t_start_us = ggml_time_us();
+ model.t_start_us = ggml_time_us();

  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));

- lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
- auto & model = lctx.model;
+ vocab = std::move(ml->file_loaders.at(0)->vocab);
  model.hparams = ml->file_loaders.at(0)->hparams;
  model.n_gpu_layers = n_gpu_layers;
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1139,15 @@

  // create the ggml context
  {
- lctx.model.buf.resize(ctx_size);
+ model.buf.resize(ctx_size);
  if (use_mlock) {
- lctx.model.mlock_buf.init(lctx.model.buf.addr);
- lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+ model.mlock_buf.init(model.buf.addr);
+ model.mlock_buf.grow_to(model.buf.size);
  }

  struct ggml_init_params params = {
- /*.mem_size =*/ lctx.model.buf.size,
- /*.mem_buffer =*/ lctx.model.buf.addr,
+ /*.mem_size =*/ model.buf.size,
+ /*.mem_buffer =*/ model.buf.addr,
  /*.no_alloc =*/ ml->use_mmap,
  };

@@ -1311,7 +1328,7 @@
  }
  #endif

- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);

  if (progress_callback) {
  progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1338,13 @@

  // loading time will be recalculate after the first eval, so
  // we take page faults deferred by mmap() into consideration
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+ model.t_load_us = ggml_time_us() - model.t_start_us;
  }

  static bool llama_model_load(
  const std::string & fname,
- llama_context & lctx,
+ llama_model & model,
+ llama_vocab & vocab,
  int n_ctx,
  int n_batch,
  int n_gpu_layers,
@@ -1340,7 +1358,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1378,7 +1396,7 @@ static bool llama_eval_internal(
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;

- const auto & kv_self = model.kv_self;
+ const auto & kv_self = lctx.kv_self;

  LLAMA_ASSERT(!!kv_self.ctx);

@@ -1473,11 +1491,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

@@ -1726,7 +1744,7 @@ static bool llama_eval_internal(
  //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

  // update kv token count
- lctx.model.kv_self.n = n_past + N;
+ lctx.kv_self.n = n_past + N;

  // extract logits
  {
@@ -2005,9 +2023,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  for (size_t i = 0; i < candidates->size; ++i) {
  cum_sum += candidates->data[i].p;

- // Check if the running sum is greater than p or if we have kept at least min_keep tokens
- if (cum_sum > p && i >= min_keep) {
- last_idx = i;
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
+ if (cum_sum >= p && i + 1 >= min_keep) {
+ last_idx = i + 1;
  break;
  }
  }
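
The top-p change above fixes the cutoff: the running sum is compared with >= instead of >, and last_idx is set to i + 1 so the candidate that crosses the threshold is kept while min_keep is still honoured. A small sketch of the corrected truncation logic over a plain sorted probability vector (not the llama_token_data_array type used in the library):

    #include <cstddef>
    #include <vector>

    // Return how many of the (descending, normalized) probabilities to keep
    // so that their cumulative mass reaches p, keeping at least min_keep.
    size_t top_p_cutoff(const std::vector<float> & probs, float p, size_t min_keep) {
        size_t last_idx = probs.size();
        float  cum_sum  = 0.0f;
        for (size_t i = 0; i < probs.size(); ++i) {
            cum_sum += probs[i];
            if (cum_sum >= p && i + 1 >= min_keep) {
                last_idx = i + 1; // include the candidate that crossed the threshold
                break;
            }
        }
        return last_idx;
    }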
@@ -2459,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  std::vector<std::thread> workers;
  std::mutex mutex;

+ auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+ };
+
  size_t idx = 0;
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
  llama_buffer read_data;
@@ -2513,15 +2536,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
- (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  ++i_attention_wv;
  } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
- (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
  ++i_feed_forward_w2;
  } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
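
The mixed-quantization heuristic that was previously written out twice is now the use_more_bits lambda: the first eighth and the last eighth of the layers, plus every third layer in between, are promoted to Q6_K for the attention.wv and feed_forward.w2 tensors. A small standalone sketch that prints which layers the rule selects (the 32-layer count is only an example, roughly a 7B model):

    #include <cstdio>

    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 ||
               (i_layer - num_layers/8) % 3 == 2;
    }

    int main() {
        const int n_layers = 32; // example layer count
        for (int i = 0; i < n_layers; ++i) {
            std::printf("layer %2d -> %s\n", i, use_more_bits(i, n_layers) ? "Q6_K" : "default k-quant");
        }
        return 0;
    }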
@@ -2634,12 +2658,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // interface implementation
  //

- struct llama_context * llama_init_from_file(
+ struct llama_model * llama_load_model_from_file(
  const char * path_model,
  struct llama_context_params params) {
  ggml_time_init();

- llama_context * ctx = new llama_context;
+ llama_model * model = new llama_model;
+
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ delete model;
+ fprintf(stderr, "%s: failed to load model\n", __func__);
+ return nullptr;
+ }
+
+ return model;
+ }
+
+ void llama_free_model(struct llama_model * model) {
+ delete model;
+ }
+
+ struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+
+ if (!model) {
+ return nullptr;
+ }
+
+ llama_context * ctx = new llama_context(*model, model->vocab);

  if (params.seed < 0) {
  params.seed = time(NULL);
@@ -2667,24 +2718,16 @@ struct llama_context * llama_init_from_file(

  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
- params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
- fprintf(stderr, "%s: failed to load model\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
-
  // reserve memory for context buffers
  if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
  return nullptr;
  }

  {
- const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+ const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }

@@ -2736,8 +2779,8 @@ struct llama_context * llama_init_from_file(

  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));

  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2791,23 @@ struct llama_context * llama_init_from_file(
  return ctx;
  }

+ struct llama_context * llama_init_from_file(
+ const char * path_model,
+ struct llama_context_params params) {
+
+ struct llama_model * model = llama_load_model_from_file(path_model, params);
+ if (!model) {
+ return nullptr;
+ }
+ struct llama_context * ctx = llama_new_context_with_model(model, params);
+ ctx->model_owner = true;
+ return ctx;
+ }
+
  void llama_free(struct llama_context * ctx) {
+ if (ctx->model_owner) {
+ delete &ctx->model;
+ }
  delete ctx;
  }

@@ -2765,11 +2824,9 @@ int llama_model_quantize(
  }
  }

- int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
  fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

- auto & model = ctx->model;
-
  const int64_t t_start_lora_us = ggml_time_us();

  auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2846,7 +2903,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  // maybe this should in llama_model_loader
  if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
  }
  }

@@ -3012,7 +3069,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+ return 1;
+ }
+ }
+
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ try {
+ return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
  } catch (const std::exception & err) {
  fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
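
There is now also a model-level LoRA entry point, so an adapter can be applied before any context exists. A brief sketch (paths are placeholders; passing a null base-model path is assumed here to mean "no separate base model"):

    #include "llama.h"

    // Returns true on success; llama_model_apply_lora_from_file() returns 0 on success.
    bool apply_lora(struct llama_model * model, int n_threads) {
        return llama_model_apply_lora_from_file(model,
                                                "lora-adapter.bin",
                                                /* path_base_model */ nullptr,
                                                n_threads) == 0;
    }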
@@ -3020,7 +3086,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ return ctx->kv_self.n;
  }

  #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3111,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
  const size_t s_kv_size = sizeof(size_t);
  const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ctx->model.kv_self.buf.size;
+ const size_t s_kv = ctx->kv_self.buf.size;

  const size_t s_total = (
  + s_rng_size
@@ -3111,7 +3177,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

  // copy kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3215,7 +3281,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

  // set kv cache
  {
- const auto & kv_self = ctx->model.kv_self;
+ const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd;
@@ -3259,7 +3325,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }

- ctx->model.kv_self.n = kv_ntok;
+ ctx->kv_self.n = kv_ntok;
  }

  const size_t nread = inp - src;
@@ -3506,6 +3572,6 @@ const char * llama_print_system_info(void) {
  }

  // For internal test use
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }