@fugood/llama.node 1.4.15 → 1.6.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +1 -5
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +76 -61
- package/src/LlamaContext.cpp +20 -32
- package/src/llama.cpp/common/CMakeLists.txt +12 -0
- package/src/llama.cpp/common/arg.cpp +20 -0
- package/src/llama.cpp/common/chat-parser.cpp +3 -3
- package/src/llama.cpp/common/chat-parser.h +4 -4
- package/src/llama.cpp/common/chat.cpp +289 -34
- package/src/llama.cpp/common/chat.h +32 -20
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +31 -25
- package/src/llama.cpp/common/download.cpp +19 -14
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/json-partial.h +1 -0
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +5 -1
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "llama.h"
|
|
4
4
|
|
|
5
5
|
#include <array>
|
|
6
|
+
#include <cassert>
|
|
6
7
|
|
|
7
8
|
// bump if necessary
|
|
8
9
|
#define LLAMA_MAX_LAYERS 512
|
|
@@ -274,9 +275,45 @@ struct llama_hparams {
|
|
|
274
275
|
uint32_t n_layer_kv() const;
|
|
275
276
|
|
|
276
277
|
// note that this function uses different SWA parameters from those in the hparams
|
|
278
|
+
// note: inlined on purpose for performance reasons
|
|
277
279
|
// TODO: think of a better place for this function
|
|
278
280
|
// TODO: pack the SWA params in a struct?
|
|
279
|
-
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1)
|
|
281
|
+
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
|
282
|
+
assert(p0 >= 0 && p1 >= 0);
|
|
283
|
+
|
|
284
|
+
switch (swa_type) {
|
|
285
|
+
case LLAMA_SWA_TYPE_NONE:
|
|
286
|
+
{
|
|
287
|
+
} break;
|
|
288
|
+
case LLAMA_SWA_TYPE_STANDARD:
|
|
289
|
+
{
|
|
290
|
+
if (p1 - p0 >= (int32_t) n_swa) {
|
|
291
|
+
return true;
|
|
292
|
+
}
|
|
293
|
+
} break;
|
|
294
|
+
case LLAMA_SWA_TYPE_CHUNKED:
|
|
295
|
+
{
|
|
296
|
+
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
|
297
|
+
|
|
298
|
+
if (p0 < pos_chunk_start) {
|
|
299
|
+
return true;
|
|
300
|
+
}
|
|
301
|
+
} break;
|
|
302
|
+
case LLAMA_SWA_TYPE_SYMMETRIC:
|
|
303
|
+
{
|
|
304
|
+
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
|
305
|
+
const int32_t pos_diff = p1 - p0;
|
|
306
|
+
|
|
307
|
+
// Mask if outside the symmetric window
|
|
308
|
+
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
|
309
|
+
return true;
|
|
310
|
+
}
|
|
311
|
+
} break;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
return false;
|
|
315
|
+
}
|
|
316
|
+
|
|
280
317
|
|
|
281
318
|
bool use_mrope() const;
|
|
282
319
|
};
|
|
@@ -852,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
|
|
852
852
|
const llama_seq_id seq_id_cell = cells.seq_get(idx);
|
|
853
853
|
|
|
854
854
|
// SWA mask
|
|
855
|
-
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
|
855
|
+
if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
|
856
856
|
can_use = true;
|
|
857
857
|
}
|
|
858
858
|
}
|
|
@@ -1237,90 +1237,236 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
|
|
|
1237
1237
|
}
|
|
1238
1238
|
}
|
|
1239
1239
|
|
|
1240
|
-
|
|
1241
|
-
const
|
|
1240
|
+
struct args_set_input_kq_mask {
|
|
1241
|
+
const llama_hparams & hparams;
|
|
1242
|
+
const llama_ubatch * ubatch;
|
|
1242
1243
|
|
|
1243
|
-
|
|
1244
|
-
|
|
1244
|
+
const std::vector<llama_kv_cells> & v_cells;
|
|
1245
|
+
const std::vector<uint32_t> & seq_to_stream;
|
|
1245
1246
|
|
|
1246
|
-
|
|
1247
|
-
|
|
1247
|
+
uint32_t n_swa;
|
|
1248
|
+
llama_swa_type swa_type;
|
|
1248
1249
|
|
|
1249
|
-
|
|
1250
|
+
int64_t n_kv;
|
|
1251
|
+
int64_t n_stream;
|
|
1252
|
+
int64_t n_tps;
|
|
1253
|
+
};
|
|
1250
1254
|
|
|
1251
|
-
|
|
1252
|
-
|
|
1255
|
+
template<bool causal, bool swa, bool is_2d, bool alibi>
|
|
1256
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1257
|
+
//const auto & hparams = args.hparams;
|
|
1258
|
+
const auto & ubatch = args.ubatch;
|
|
1253
1259
|
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
|
|
1257
|
-
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
|
|
1258
|
-
// Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
|
|
1259
|
-
// Causal mask:
|
|
1260
|
-
// xxx-------
|
|
1261
|
-
// xxxx------
|
|
1262
|
-
// xxxxx-----
|
|
1263
|
-
// Non-causal mask:
|
|
1264
|
-
// xxxxx-----
|
|
1265
|
-
// xxxxx-----
|
|
1266
|
-
// xxxxx-----
|
|
1267
|
-
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
|
|
1268
|
-
// TODO: optimize this section
|
|
1269
|
-
for (uint32_t h = 0; h < 1; ++h) {
|
|
1270
|
-
for (uint32_t s = 0; s < n_stream; ++s) {
|
|
1271
|
-
for (uint32_t ii = 0; ii < n_tps; ++ii) {
|
|
1272
|
-
const uint32_t i = s*n_tps + ii;
|
|
1260
|
+
const auto & v_cells = args.v_cells;
|
|
1261
|
+
const auto & seq_to_stream = args.seq_to_stream;
|
|
1273
1262
|
|
|
1274
|
-
|
|
1263
|
+
const uint32_t n_swa = args.n_swa;
|
|
1264
|
+
const llama_swa_type swa_type = args.swa_type;
|
|
1275
1265
|
|
|
1276
|
-
|
|
1266
|
+
const int64_t n_kv = args.n_kv;
|
|
1267
|
+
const int64_t n_stream = args.n_stream;
|
|
1268
|
+
const int64_t n_tps = args.n_tps;
|
|
1277
1269
|
|
|
1278
|
-
|
|
1270
|
+
// the min position in the batch for each sequence
|
|
1271
|
+
llama_pos seq_pos_min[LLAMA_MAX_SEQ];
|
|
1272
|
+
std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
|
|
1279
1273
|
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
|
1283
|
-
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
|
1274
|
+
for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
|
|
1275
|
+
const llama_seq_id seq_id = ubatch->seq_id[i][0];
|
|
1284
1276
|
|
|
1285
|
-
|
|
1277
|
+
seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
|
|
1278
|
+
}
|
|
1286
1279
|
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1280
|
+
for (uint32_t s = 0; s < n_stream; ++s) {
|
|
1281
|
+
// bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
|
|
1282
|
+
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
|
|
1283
|
+
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
|
|
1284
|
+
|
|
1285
|
+
for (uint32_t ii = 0; ii < n_tps; ++ii) {
|
|
1286
|
+
const uint32_t i = s*n_tps + ii;
|
|
1287
|
+
|
|
1288
|
+
const llama_seq_id seq_id = ubatch->seq_id[i][0];
|
|
1289
|
+
|
|
1290
|
+
const auto & cells = v_cells.at(seq_to_stream[seq_id]);
|
|
1291
|
+
|
|
1292
|
+
llama_pos p0 = -1;
|
|
1293
|
+
const llama_pos p1 = ubatch->pos[i];
|
|
1294
|
+
|
|
1295
|
+
// for M-RoPE
|
|
1296
|
+
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
|
1297
|
+
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
|
1298
|
+
|
|
1299
|
+
const uint64_t idst = n_kv*i;
|
|
1300
|
+
|
|
1301
|
+
// for tokens of the same sequence, the mask is mostly the same, so we can reuse it
|
|
1302
|
+
// the only cells that could change are the ones that are with similar positions as the
|
|
1303
|
+
// ones in the batch (i.e. due to causal masking, SWA, etc.)
|
|
1304
|
+
// keep track of those cells and shortcut the loop to save time
|
|
1305
|
+
// note: this optimization is not compatible with Alibi position encoding
|
|
1306
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/18842
|
|
1307
|
+
bool prev = false;
|
|
1291
1308
|
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1309
|
+
auto & idxs = seq_idxs[seq_id];
|
|
1310
|
+
|
|
1311
|
+
if (!alibi) {
|
|
1312
|
+
if (seq_srct.find(seq_id) != seq_srct.end()) {
|
|
1313
|
+
const uint32_t srct = seq_srct[seq_id];
|
|
1314
|
+
|
|
1315
|
+
const uint64_t idst_prev = n_kv*srct;
|
|
1316
|
+
|
|
1317
|
+
std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
|
|
1318
|
+
|
|
1319
|
+
prev = true;
|
|
1320
|
+
} else {
|
|
1321
|
+
idxs.clear();
|
|
1322
|
+
idxs.reserve(ubatch->n_tokens + n_swa + 32);
|
|
1323
|
+
|
|
1324
|
+
seq_srct[seq_id] = i;
|
|
1325
|
+
}
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
for (uint32_t jj = 0; jj < n_kv; ++jj) {
|
|
1329
|
+
uint32_t j = jj;
|
|
1330
|
+
|
|
1331
|
+
// we have an existing mask for this sequence -> update just seq_idxs
|
|
1332
|
+
if (!alibi) {
|
|
1333
|
+
if (prev) {
|
|
1334
|
+
if (jj >= idxs.size()) {
|
|
1335
|
+
break;
|
|
1336
|
+
}
|
|
1337
|
+
|
|
1338
|
+
j = idxs[jj];
|
|
1295
1339
|
}
|
|
1340
|
+
}
|
|
1341
|
+
|
|
1342
|
+
if (cells.is_empty(j)) {
|
|
1343
|
+
goto skip;
|
|
1344
|
+
}
|
|
1345
|
+
|
|
1346
|
+
// mask the token if not the same sequence
|
|
1347
|
+
if (!cells.seq_has(j, seq_id)) {
|
|
1348
|
+
goto skip;
|
|
1349
|
+
}
|
|
1350
|
+
|
|
1351
|
+
p0 = cells.pos_get(j);
|
|
1296
1352
|
|
|
1297
|
-
|
|
1353
|
+
if (!alibi) {
|
|
1354
|
+
if (!prev) {
|
|
1355
|
+
// record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
|
|
1356
|
+
if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
|
|
1357
|
+
idxs.push_back(j);
|
|
1358
|
+
}
|
|
1359
|
+
}
|
|
1360
|
+
}
|
|
1298
1361
|
|
|
1362
|
+
if (causal) {
|
|
1299
1363
|
// mask future tokens
|
|
1300
|
-
if (
|
|
1301
|
-
|
|
1364
|
+
if (p0 > p1) {
|
|
1365
|
+
goto skip;
|
|
1302
1366
|
}
|
|
1303
1367
|
|
|
1304
1368
|
// M-RoPE causal mask
|
|
1305
|
-
if (
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1369
|
+
if (is_2d) {
|
|
1370
|
+
if (p0 == p1) {
|
|
1371
|
+
const auto & p0_ext = cells.ext_get(j);
|
|
1372
|
+
|
|
1373
|
+
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
|
|
1374
|
+
goto skip;
|
|
1375
|
+
}
|
|
1309
1376
|
}
|
|
1310
1377
|
}
|
|
1378
|
+
}
|
|
1311
1379
|
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1380
|
+
// apply SWA if any
|
|
1381
|
+
if (swa) {
|
|
1382
|
+
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
|
|
1383
|
+
goto skip;
|
|
1315
1384
|
}
|
|
1385
|
+
}
|
|
1316
1386
|
|
|
1317
|
-
|
|
1387
|
+
if (alibi) {
|
|
1388
|
+
data[idst + j] = -std::abs(p0 - p1);
|
|
1389
|
+
} else {
|
|
1390
|
+
data[idst + j] = 0.0f;
|
|
1318
1391
|
}
|
|
1392
|
+
|
|
1393
|
+
continue;
|
|
1394
|
+
skip:
|
|
1395
|
+
data[idst + j] = -INFINITY;
|
|
1319
1396
|
}
|
|
1320
1397
|
}
|
|
1321
1398
|
}
|
|
1322
1399
|
}
|
|
1323
1400
|
|
|
1401
|
+
template<bool causal, bool swa, bool is_2d>
|
|
1402
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1403
|
+
const bool alibi = args.hparams.use_alibi;
|
|
1404
|
+
if (alibi) {
|
|
1405
|
+
set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
|
|
1406
|
+
} else {
|
|
1407
|
+
set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
|
|
1408
|
+
}
|
|
1409
|
+
}
|
|
1410
|
+
|
|
1411
|
+
template<bool causal, bool swa>
|
|
1412
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1413
|
+
const bool is_2d = args.ubatch->is_pos_2d();
|
|
1414
|
+
if (is_2d) {
|
|
1415
|
+
set_input_kq_mask_impl<causal, swa, true> (args, data);
|
|
1416
|
+
} else {
|
|
1417
|
+
set_input_kq_mask_impl<causal, swa, false>(args, data);
|
|
1418
|
+
}
|
|
1419
|
+
}
|
|
1420
|
+
|
|
1421
|
+
template<bool causal>
|
|
1422
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1423
|
+
const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
|
|
1424
|
+
if (swa) {
|
|
1425
|
+
set_input_kq_mask_impl<causal, true> (args, data);
|
|
1426
|
+
} else {
|
|
1427
|
+
set_input_kq_mask_impl<causal, false>(args, data);
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
|
|
1431
|
+
void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
|
|
1432
|
+
const uint32_t n_tokens = ubatch->n_tokens;
|
|
1433
|
+
|
|
1434
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
|
1435
|
+
float * data = (float *) dst->data;
|
|
1436
|
+
|
|
1437
|
+
const int64_t n_kv = dst->ne[0];
|
|
1438
|
+
const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
|
|
1439
|
+
|
|
1440
|
+
GGML_ASSERT(n_tokens%n_stream == 0);
|
|
1441
|
+
|
|
1442
|
+
// n_tps == n_tokens_per_stream
|
|
1443
|
+
const int64_t n_tps = n_tokens/n_stream;
|
|
1444
|
+
|
|
1445
|
+
//const int64_t t_start = ggml_time_us();
|
|
1446
|
+
|
|
1447
|
+
const args_set_input_kq_mask args = {
|
|
1448
|
+
/*.hparams =*/ hparams,
|
|
1449
|
+
/*.ubatch =*/ ubatch,
|
|
1450
|
+
/*.v_cells =*/ v_cells,
|
|
1451
|
+
/*.seq_to_stream =*/ seq_to_stream,
|
|
1452
|
+
/*.n_swa =*/ n_swa,
|
|
1453
|
+
/*.swa_type =*/ swa_type,
|
|
1454
|
+
/*.n_kv =*/ n_kv,
|
|
1455
|
+
/*.n_stream =*/ n_stream,
|
|
1456
|
+
/*.n_tps =*/ n_tps,
|
|
1457
|
+
};
|
|
1458
|
+
|
|
1459
|
+
if (causal_attn) {
|
|
1460
|
+
set_input_kq_mask_impl<true> (args, data);
|
|
1461
|
+
} else {
|
|
1462
|
+
set_input_kq_mask_impl<false>(args, data);
|
|
1463
|
+
}
|
|
1464
|
+
|
|
1465
|
+
//const int64_t t_end = ggml_time_us();
|
|
1466
|
+
|
|
1467
|
+
//LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
|
|
1468
|
+
}
|
|
1469
|
+
|
|
1324
1470
|
void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
|
|
1325
1471
|
const int64_t n_tokens = ubatch->n_tokens;
|
|
1326
1472
|
|
|
@@ -1483,10 +1629,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|
|
1483
1629
|
return gf;
|
|
1484
1630
|
}
|
|
1485
1631
|
|
|
1486
|
-
bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
|
1487
|
-
return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
|
|
1488
|
-
}
|
|
1489
|
-
|
|
1490
1632
|
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
|
1491
1633
|
GGML_UNUSED(flags);
|
|
1492
1634
|
|
|
@@ -265,7 +265,8 @@ struct llama_file::impl {
|
|
|
265
265
|
continue; // Interrupted by signal, retry
|
|
266
266
|
}
|
|
267
267
|
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
|
268
|
-
if (errno == EFAULT) {
|
|
268
|
+
if (errno == EFAULT || errno == EINVAL) {
|
|
269
|
+
LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
|
|
269
270
|
auto curr_off = tell();
|
|
270
271
|
close(fd);
|
|
271
272
|
fd = -1;
|
|
@@ -384,6 +385,9 @@ int llama_file::file_id() const {
|
|
|
384
385
|
#ifdef _WIN32
|
|
385
386
|
return _fileno(pimpl->fp);
|
|
386
387
|
#else
|
|
388
|
+
if (pimpl->fd != -1) {
|
|
389
|
+
return pimpl->fd;
|
|
390
|
+
}
|
|
387
391
|
#if defined(fileno)
|
|
388
392
|
return fileno(pimpl->fp);
|
|
389
393
|
#else
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
#include "ggml.h"
|
|
4
4
|
|
|
5
|
+
#include <algorithm>
|
|
5
6
|
#include <array>
|
|
6
7
|
#include <cinttypes>
|
|
7
8
|
#include <cstring>
|
|
@@ -344,6 +345,7 @@ namespace GGUFMeta {
|
|
|
344
345
|
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
|
|
345
346
|
|
|
346
347
|
switch (arr_info.gt) {
|
|
348
|
+
case GGUF_TYPE_BOOL:
|
|
347
349
|
case GGUF_TYPE_UINT32:
|
|
348
350
|
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
|
|
349
351
|
(std::is_same<T, uint32_t>::value)); break;
|
|
@@ -365,7 +367,13 @@ namespace GGUFMeta {
|
|
|
365
367
|
result[i] = value;
|
|
366
368
|
}
|
|
367
369
|
} else {
|
|
368
|
-
|
|
370
|
+
if (arr_info.gt == GGUF_TYPE_BOOL) {
|
|
371
|
+
std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
|
|
372
|
+
return static_cast<T>(x);
|
|
373
|
+
});
|
|
374
|
+
} else {
|
|
375
|
+
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
|
|
376
|
+
}
|
|
369
377
|
}
|
|
370
378
|
|
|
371
379
|
return true;
|
|
@@ -531,12 +539,18 @@ llama_model_loader::llama_model_loader(
|
|
|
531
539
|
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
|
532
540
|
contexts.emplace_back(ctx);
|
|
533
541
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
542
|
+
if (use_mmap && use_direct_io) {
|
|
543
|
+
if (files.back()->has_direct_io()) {
|
|
544
|
+
// Disable mmap, as DirectIO is available
|
|
545
|
+
use_mmap = false;
|
|
546
|
+
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
|
547
|
+
} else {
|
|
548
|
+
// Disable DirectIO and reopen file using std::fopen for mmap
|
|
549
|
+
use_direct_io = false;
|
|
550
|
+
files.pop_back();
|
|
551
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
|
552
|
+
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
|
553
|
+
}
|
|
540
554
|
}
|
|
541
555
|
|
|
542
556
|
// Save tensors data offset of the main file.
|
|
@@ -468,7 +468,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
|
|
|
468
468
|
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
|
469
469
|
}
|
|
470
470
|
|
|
471
|
-
llama_model::~llama_model()
|
|
471
|
+
llama_model::~llama_model() {
|
|
472
|
+
for (auto * lora : loras) {
|
|
473
|
+
delete lora;
|
|
474
|
+
}
|
|
475
|
+
}
|
|
472
476
|
|
|
473
477
|
void llama_model::load_stats(llama_model_loader & ml) {
|
|
474
478
|
pimpl->n_elements = ml.n_elements;
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
#include <memory>
|
|
12
12
|
#include <string>
|
|
13
13
|
#include <unordered_map>
|
|
14
|
+
#include <unordered_set>
|
|
14
15
|
#include <vector>
|
|
15
16
|
|
|
16
17
|
struct llama_cparams;
|
|
@@ -476,8 +477,8 @@ struct llama_model {
|
|
|
476
477
|
// for quantize-stats only
|
|
477
478
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
|
478
479
|
|
|
479
|
-
// for keeping track of
|
|
480
|
-
|
|
480
|
+
// for keeping track of associated LoRA adapters
|
|
481
|
+
std::unordered_set<llama_adapter_lora *> loras;
|
|
481
482
|
|
|
482
483
|
int64_t t_load_us = 0;
|
|
483
484
|
int64_t t_start_us = 0;
|