@fugood/llama.node 1.4.15 → 1.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/lib/binding.ts +1 -5
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +2 -2
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +76 -61
  6. package/src/LlamaContext.cpp +20 -32
  7. package/src/llama.cpp/common/CMakeLists.txt +12 -0
  8. package/src/llama.cpp/common/arg.cpp +20 -0
  9. package/src/llama.cpp/common/chat.cpp +289 -34
  10. package/src/llama.cpp/common/chat.h +16 -13
  11. package/src/llama.cpp/common/common.cpp +0 -1
  12. package/src/llama.cpp/common/common.h +28 -25
  13. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  14. package/src/llama.cpp/common/jinja/caps.h +24 -0
  15. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  16. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  17. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  18. package/src/llama.cpp/common/jinja/parser.h +21 -0
  19. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  20. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  21. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  22. package/src/llama.cpp/common/jinja/string.h +58 -0
  23. package/src/llama.cpp/common/jinja/utils.h +49 -0
  24. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  25. package/src/llama.cpp/common/jinja/value.h +464 -0
  26. package/src/llama.cpp/common/sampling.cpp +52 -19
  27. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  30. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  32. package/src/llama.cpp/include/llama-cpp.h +3 -1
  33. package/src/llama.cpp/include/llama.h +29 -2
  34. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  35. package/src/llama.cpp/src/llama-adapter.h +1 -3
  36. package/src/llama.cpp/src/llama-context.cpp +232 -144
  37. package/src/llama.cpp/src/llama-context.h +10 -0
  38. package/src/llama.cpp/src/llama-cparams.h +2 -0
  39. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  40. package/src/llama.cpp/src/llama-hparams.h +38 -1
  41. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  42. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  43. package/src/llama.cpp/src/llama-mmap.cpp +5 -1
  44. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  45. package/src/llama.cpp/src/llama-model.cpp +5 -1
  46. package/src/llama.cpp/src/llama-model.h +3 -2
  47. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
@@ -3,6 +3,7 @@
3
3
  #include "llama.h"
4
4
 
5
5
  #include <array>
6
+ #include <cassert>
6
7
 
7
8
  // bump if necessary
8
9
  #define LLAMA_MAX_LAYERS 512
@@ -274,9 +275,45 @@ struct llama_hparams {
274
275
  uint32_t n_layer_kv() const;
275
276
 
276
277
  // note that this function uses different SWA parameters from those in the hparams
278
+ // note: inlined on purpose for performance reasons
277
279
  // TODO: think of a better place for this function
278
280
  // TODO: pack the SWA params in a struct?
279
- static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
281
+ static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
282
+ assert(p0 >= 0 && p1 >= 0);
283
+
284
+ switch (swa_type) {
285
+ case LLAMA_SWA_TYPE_NONE:
286
+ {
287
+ } break;
288
+ case LLAMA_SWA_TYPE_STANDARD:
289
+ {
290
+ if (p1 - p0 >= (int32_t) n_swa) {
291
+ return true;
292
+ }
293
+ } break;
294
+ case LLAMA_SWA_TYPE_CHUNKED:
295
+ {
296
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
297
+
298
+ if (p0 < pos_chunk_start) {
299
+ return true;
300
+ }
301
+ } break;
302
+ case LLAMA_SWA_TYPE_SYMMETRIC:
303
+ {
304
+ const int32_t half_n_swa = (int32_t) n_swa / 2;
305
+ const int32_t pos_diff = p1 - p0;
306
+
307
+ // Mask if outside the symmetric window
308
+ if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
309
+ return true;
310
+ }
311
+ } break;
312
+ }
313
+
314
+ return false;
315
+ }
316
+
280
317
 
281
318
  bool use_mrope() const;
282
319
  };
@@ -852,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
852
852
  const llama_seq_id seq_id_cell = cells.seq_get(idx);
853
853
 
854
854
  // SWA mask
855
- if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
855
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
856
856
  can_use = true;
857
857
  }
858
858
  }
@@ -1237,90 +1237,236 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
1237
1237
  }
1238
1238
  }
1239
1239
 
1240
- void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
1241
- const uint32_t n_tokens = ubatch->n_tokens;
1240
+ struct args_set_input_kq_mask {
1241
+ const llama_hparams & hparams;
1242
+ const llama_ubatch * ubatch;
1242
1243
 
1243
- GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1244
- float * data = (float *) dst->data;
1244
+ const std::vector<llama_kv_cells> & v_cells;
1245
+ const std::vector<uint32_t> & seq_to_stream;
1245
1246
 
1246
- const int64_t n_kv = dst->ne[0];
1247
- const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
1247
+ uint32_t n_swa;
1248
+ llama_swa_type swa_type;
1248
1249
 
1249
- GGML_ASSERT(n_tokens%n_stream == 0);
1250
+ int64_t n_kv;
1251
+ int64_t n_stream;
1252
+ int64_t n_tps;
1253
+ };
1250
1254
 
1251
- // n_tps == n_tokens_per_stream
1252
- const int64_t n_tps = n_tokens/n_stream;
1255
+ template<bool causal, bool swa, bool is_2d, bool alibi>
1256
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1257
+ //const auto & hparams = args.hparams;
1258
+ const auto & ubatch = args.ubatch;
1253
1259
 
1254
- std::fill(data, data + ggml_nelements(dst), -INFINITY);
1255
-
1256
- // Use only the previous KV cells of the correct sequence for each token of the ubatch.
1257
- // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
1258
- // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
1259
- // Causal mask:
1260
- // xxx-------
1261
- // xxxx------
1262
- // xxxxx-----
1263
- // Non-causal mask:
1264
- // xxxxx-----
1265
- // xxxxx-----
1266
- // xxxxx-----
1267
- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
1268
- // TODO: optimize this section
1269
- for (uint32_t h = 0; h < 1; ++h) {
1270
- for (uint32_t s = 0; s < n_stream; ++s) {
1271
- for (uint32_t ii = 0; ii < n_tps; ++ii) {
1272
- const uint32_t i = s*n_tps + ii;
1260
+ const auto & v_cells = args.v_cells;
1261
+ const auto & seq_to_stream = args.seq_to_stream;
1273
1262
 
1274
- const llama_seq_id seq_id = ubatch->seq_id[i][0];
1263
+ const uint32_t n_swa = args.n_swa;
1264
+ const llama_swa_type swa_type = args.swa_type;
1275
1265
 
1276
- const auto & cells = v_cells[seq_to_stream[seq_id]];
1266
+ const int64_t n_kv = args.n_kv;
1267
+ const int64_t n_stream = args.n_stream;
1268
+ const int64_t n_tps = args.n_tps;
1277
1269
 
1278
- const llama_pos p1 = ubatch->pos[i];
1270
+ // the min position in the batch for each sequence
1271
+ llama_pos seq_pos_min[LLAMA_MAX_SEQ];
1272
+ std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
1279
1273
 
1280
- // for M-RoPE
1281
- const bool is_2d = ubatch->is_pos_2d();
1282
- const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
1283
- const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
1274
+ for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
1275
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
1284
1276
 
1285
- const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
1277
+ seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
1278
+ }
1286
1279
 
1287
- for (uint32_t j = 0; j < n_kv; ++j) {
1288
- if (cells.is_empty(j)) {
1289
- continue;
1290
- }
1280
+ for (uint32_t s = 0; s < n_stream; ++s) {
1281
+ // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
1282
+ std::unordered_map<llama_seq_id, uint32_t> seq_srct;
1283
+ std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
1284
+
1285
+ for (uint32_t ii = 0; ii < n_tps; ++ii) {
1286
+ const uint32_t i = s*n_tps + ii;
1287
+
1288
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
1289
+
1290
+ const auto & cells = v_cells.at(seq_to_stream[seq_id]);
1291
+
1292
+ llama_pos p0 = -1;
1293
+ const llama_pos p1 = ubatch->pos[i];
1294
+
1295
+ // for M-RoPE
1296
+ const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
1297
+ const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
1298
+
1299
+ const uint64_t idst = n_kv*i;
1300
+
1301
+ // for tokens of the same sequence, the mask is mostly the same, so we can reuse it
1302
+ // the only cells that could change are the ones that are with similar positions as the
1303
+ // ones in the batch (i.e. due to causal masking, SWA, etc.)
1304
+ // keep track of those cells and shortcut the loop to save time
1305
+ // note: this optimization is not compatible with Alibi position encoding
1306
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18842
1307
+ bool prev = false;
1291
1308
 
1292
- // mask the token if not the same sequence
1293
- if (!cells.seq_has(j, seq_id)) {
1294
- continue;
1309
+ auto & idxs = seq_idxs[seq_id];
1310
+
1311
+ if (!alibi) {
1312
+ if (seq_srct.find(seq_id) != seq_srct.end()) {
1313
+ const uint32_t srct = seq_srct[seq_id];
1314
+
1315
+ const uint64_t idst_prev = n_kv*srct;
1316
+
1317
+ std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
1318
+
1319
+ prev = true;
1320
+ } else {
1321
+ idxs.clear();
1322
+ idxs.reserve(ubatch->n_tokens + n_swa + 32);
1323
+
1324
+ seq_srct[seq_id] = i;
1325
+ }
1326
+ }
1327
+
1328
+ for (uint32_t jj = 0; jj < n_kv; ++jj) {
1329
+ uint32_t j = jj;
1330
+
1331
+ // we have an existing mask for this sequence -> update just seq_idxs
1332
+ if (!alibi) {
1333
+ if (prev) {
1334
+ if (jj >= idxs.size()) {
1335
+ break;
1336
+ }
1337
+
1338
+ j = idxs[jj];
1295
1339
  }
1340
+ }
1341
+
1342
+ if (cells.is_empty(j)) {
1343
+ goto skip;
1344
+ }
1345
+
1346
+ // mask the token if not the same sequence
1347
+ if (!cells.seq_has(j, seq_id)) {
1348
+ goto skip;
1349
+ }
1350
+
1351
+ p0 = cells.pos_get(j);
1296
1352
 
1297
- const llama_pos p0 = cells.pos_get(j);
1353
+ if (!alibi) {
1354
+ if (!prev) {
1355
+ // record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
1356
+ if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
1357
+ idxs.push_back(j);
1358
+ }
1359
+ }
1360
+ }
1298
1361
 
1362
+ if (causal) {
1299
1363
  // mask future tokens
1300
- if (causal_attn && p0 > p1) {
1301
- continue;
1364
+ if (p0 > p1) {
1365
+ goto skip;
1302
1366
  }
1303
1367
 
1304
1368
  // M-RoPE causal mask
1305
- if (causal_attn && is_2d && p0 == p1) {
1306
- const auto & p0_ext = cells.ext_get(j);
1307
- if (p0_ext.is_2d_gt(p1_x, p1_y)) {
1308
- continue;
1369
+ if (is_2d) {
1370
+ if (p0 == p1) {
1371
+ const auto & p0_ext = cells.ext_get(j);
1372
+
1373
+ if (p0_ext.is_2d_gt(p1_x, p1_y)) {
1374
+ goto skip;
1375
+ }
1309
1376
  }
1310
1377
  }
1378
+ }
1311
1379
 
1312
- // apply SWA if any
1313
- if (is_masked_swa(p0, p1)) {
1314
- continue;
1380
+ // apply SWA if any
1381
+ if (swa) {
1382
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
1383
+ goto skip;
1315
1384
  }
1385
+ }
1316
1386
 
1317
- data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
1387
+ if (alibi) {
1388
+ data[idst + j] = -std::abs(p0 - p1);
1389
+ } else {
1390
+ data[idst + j] = 0.0f;
1318
1391
  }
1392
+
1393
+ continue;
1394
+ skip:
1395
+ data[idst + j] = -INFINITY;
1319
1396
  }
1320
1397
  }
1321
1398
  }
1322
1399
  }
1323
1400
 
1401
+ template<bool causal, bool swa, bool is_2d>
1402
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1403
+ const bool alibi = args.hparams.use_alibi;
1404
+ if (alibi) {
1405
+ set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
1406
+ } else {
1407
+ set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
1408
+ }
1409
+ }
1410
+
1411
+ template<bool causal, bool swa>
1412
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1413
+ const bool is_2d = args.ubatch->is_pos_2d();
1414
+ if (is_2d) {
1415
+ set_input_kq_mask_impl<causal, swa, true> (args, data);
1416
+ } else {
1417
+ set_input_kq_mask_impl<causal, swa, false>(args, data);
1418
+ }
1419
+ }
1420
+
1421
+ template<bool causal>
1422
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1423
+ const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
1424
+ if (swa) {
1425
+ set_input_kq_mask_impl<causal, true> (args, data);
1426
+ } else {
1427
+ set_input_kq_mask_impl<causal, false>(args, data);
1428
+ }
1429
+ }
1430
+
1431
+ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
1432
+ const uint32_t n_tokens = ubatch->n_tokens;
1433
+
1434
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1435
+ float * data = (float *) dst->data;
1436
+
1437
+ const int64_t n_kv = dst->ne[0];
1438
+ const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
1439
+
1440
+ GGML_ASSERT(n_tokens%n_stream == 0);
1441
+
1442
+ // n_tps == n_tokens_per_stream
1443
+ const int64_t n_tps = n_tokens/n_stream;
1444
+
1445
+ //const int64_t t_start = ggml_time_us();
1446
+
1447
+ const args_set_input_kq_mask args = {
1448
+ /*.hparams =*/ hparams,
1449
+ /*.ubatch =*/ ubatch,
1450
+ /*.v_cells =*/ v_cells,
1451
+ /*.seq_to_stream =*/ seq_to_stream,
1452
+ /*.n_swa =*/ n_swa,
1453
+ /*.swa_type =*/ swa_type,
1454
+ /*.n_kv =*/ n_kv,
1455
+ /*.n_stream =*/ n_stream,
1456
+ /*.n_tps =*/ n_tps,
1457
+ };
1458
+
1459
+ if (causal_attn) {
1460
+ set_input_kq_mask_impl<true> (args, data);
1461
+ } else {
1462
+ set_input_kq_mask_impl<false>(args, data);
1463
+ }
1464
+
1465
+ //const int64_t t_end = ggml_time_us();
1466
+
1467
+ //LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
1468
+ }
1469
+
1324
1470
  void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
1325
1471
  const int64_t n_tokens = ubatch->n_tokens;
1326
1472
 
@@ -1483,10 +1629,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
1483
1629
  return gf;
1484
1630
  }
1485
1631
 
1486
- bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
1487
- return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
1488
- }
1489
-
1490
1632
  void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
1491
1633
  GGML_UNUSED(flags);
1492
1634
 
@@ -257,8 +257,6 @@ private:
257
257
  size_t size_k_bytes() const;
258
258
  size_t size_v_bytes() const;
259
259
 
260
- bool is_masked_swa(llama_pos p0, llama_pos p1) const;
261
-
262
260
  ggml_tensor * build_rope_shift(
263
261
  const llama_cparams & cparams,
264
262
  ggml_context * ctx,
@@ -265,7 +265,8 @@ struct llama_file::impl {
265
265
  continue; // Interrupted by signal, retry
266
266
  }
267
267
  // Fallback to std::fread in case the DMA controller cannot access the buffer
268
- if (errno == EFAULT) {
268
+ if (errno == EFAULT || errno == EINVAL) {
269
+ LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
269
270
  auto curr_off = tell();
270
271
  close(fd);
271
272
  fd = -1;
@@ -384,6 +385,9 @@ int llama_file::file_id() const {
384
385
  #ifdef _WIN32
385
386
  return _fileno(pimpl->fp);
386
387
  #else
388
+ if (pimpl->fd != -1) {
389
+ return pimpl->fd;
390
+ }
387
391
  #if defined(fileno)
388
392
  return fileno(pimpl->fp);
389
393
  #else
@@ -2,6 +2,7 @@
2
2
 
3
3
  #include "ggml.h"
4
4
 
5
+ #include <algorithm>
5
6
  #include <array>
6
7
  #include <cinttypes>
7
8
  #include <cstring>
@@ -344,6 +345,7 @@ namespace GGUFMeta {
344
345
  GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
345
346
 
346
347
  switch (arr_info.gt) {
348
+ case GGUF_TYPE_BOOL:
347
349
  case GGUF_TYPE_UINT32:
348
350
  case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
349
351
  (std::is_same<T, uint32_t>::value)); break;
@@ -365,7 +367,13 @@ namespace GGUFMeta {
365
367
  result[i] = value;
366
368
  }
367
369
  } else {
368
- std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
370
+ if (arr_info.gt == GGUF_TYPE_BOOL) {
371
+ std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
372
+ return static_cast<T>(x);
373
+ });
374
+ } else {
375
+ std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
376
+ }
369
377
  }
370
378
 
371
379
  return true;
@@ -531,12 +539,18 @@ llama_model_loader::llama_model_loader(
531
539
  files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
532
540
  contexts.emplace_back(ctx);
533
541
 
534
- use_direct_io = use_direct_io && files.back()->has_direct_io();
535
-
536
- // Disable mmap in case Direct I/O is enabled and available
537
- if (use_direct_io && use_mmap) {
538
- use_mmap = false;
539
- LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
542
+ if (use_mmap && use_direct_io) {
543
+ if (files.back()->has_direct_io()) {
544
+ // Disable mmap, as DirectIO is available
545
+ use_mmap = false;
546
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
547
+ } else {
548
+ // Disable DirectIO and reopen file using std::fopen for mmap
549
+ use_direct_io = false;
550
+ files.pop_back();
551
+ files.emplace_back(new llama_file(fname.c_str(), "rb", false));
552
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
553
+ }
540
554
  }
541
555
 
542
556
  // Save tensors data offset of the main file.
@@ -468,7 +468,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
468
468
  pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
469
469
  }
470
470
 
471
- llama_model::~llama_model() = default;
471
+ llama_model::~llama_model() {
472
+ for (auto * lora : loras) {
473
+ delete lora;
474
+ }
475
+ }
472
476
 
473
477
  void llama_model::load_stats(llama_model_loader & ml) {
474
478
  pimpl->n_elements = ml.n_elements;
@@ -11,6 +11,7 @@
11
11
  #include <memory>
12
12
  #include <string>
13
13
  #include <unordered_map>
14
+ #include <unordered_set>
14
15
  #include <vector>
15
16
 
16
17
  struct llama_cparams;
@@ -476,8 +477,8 @@ struct llama_model {
476
477
  // for quantize-stats only
477
478
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
478
479
 
479
- // for keeping track of extra nodes used by lora adapters
480
- uint32_t n_lora_nodes = 0;
480
+ // for keeping track of associated LoRA adapters
481
+ std::unordered_set<llama_adapter_lora *> loras;
481
482
 
482
483
  int64_t t_load_us = 0;
483
484
  int64_t t_start_us = 0;