@fugood/llama.node 1.1.10 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +20 -2
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +174 -388
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +67 -37
- package/src/llama.cpp/common/chat.cpp +263 -2
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +5 -2
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
- package/src/llama.cpp/include/llama.h +32 -7
- package/src/llama.cpp/src/llama-adapter.cpp +101 -4
- package/src/llama.cpp/src/llama-adapter.h +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +69 -2
- package/src/llama.cpp/src/llama-arch.h +6 -0
- package/src/llama.cpp/src/llama-context.cpp +92 -45
- package/src/llama.cpp/src/llama-context.h +1 -5
- package/src/llama.cpp/src/llama-graph.cpp +74 -19
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
- package/src/llama.cpp/src/llama-kv-cache.h +4 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +434 -21
- package/src/llama.cpp/src/llama-model.h +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama-graph.cpp

@@ -258,6 +258,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
+    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
+                                (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
+                                (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
+                                (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
+    LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
+    LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
+
+    LLAMA_LOG_DEBUG("    ");
+    for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+        LLAMA_LOG_DEBUG("%2d", j);
+    }
+    LLAMA_LOG_DEBUG("\n");
+
+    for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
+        LLAMA_LOG_DEBUG(" %2d ", i);
+        for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+            float val = data[i * n_kv + j];
+            if (val == -INFINITY) {
+                LLAMA_LOG_DEBUG(" ∞");
+            } else {
+                LLAMA_LOG_DEBUG(" 0");
+            }
+        }
+        LLAMA_LOG_DEBUG("\n");
+    }
+}
+
 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
@@ -267,6 +297,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
     float * data = (float *) kq_mask->data;
 
+    // [TAG_NO_CACHE_ISWA]
+    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+
     for (int h = 0; h < 1; ++h) {
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];
@@ -277,21 +310,33 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                 for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
                     const llama_seq_id s0 = ubatch->seq_id[i0][0];
 
-                    // TODO: reimplement this like in llama_kv_cache_unified
-                    if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
-                        if (hparams.use_alibi) {
-                            f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-                        } else {
-                            f = 0.0f;
-                        }
-                        break;
+                    if (s0 != s1) {
+                        continue; // skip different sequences
                     }
-                }
 
+                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
+                        continue; // skip future tokens for causal attention
+                    }
+
+                    // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+                    //    continue; // skip masked tokens for SWA
+                    //}
+
+                    // TODO: reimplement this like in llama_kv_cache_unified
+                    if (hparams.use_alibi) {
+                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
+                    } else {
+                        f = 0.0f;
+                    }
+                }
                 data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
             }
         }
     }
+    if (debug) {
+        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+    }
 }
 
 void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
@@ -314,8 +359,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
     res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
 
-    res &= mctx->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }
 
@@ -350,8 +393,6 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
     res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
 
-    res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }
 
@@ -1225,7 +1266,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_mask,
         ggml_tensor * sinks,
         ggml_tensor * v_mla,
-        float kq_scale) const {
+        float kq_scale,
+        int il) const {
     const bool v_trans = v->nb[1] > v->nb[2];
 
     // split the batch into streams if needed
@@ -1260,6 +1302,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
+        cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
 
         ggml_flash_attn_ext_add_sinks(cur, sinks);
         ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@@ -1275,6 +1318,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             // The permutations are noops and only change how the tensor data is interpreted.
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             cur = ggml_mul_mat(ctx0, v_mla, cur);
+            cb(cur, "fattn_mla", il);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
 #endif
@@ -1283,6 +1327,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
     } else {
         ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+        cb(kq, "kq", il);
 
         // note: this op tends to require high floating point range
         // while for some models F16 is enough, for others it is not, so we default to F32 here
@@ -1296,32 +1341,42 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             // before the softmax below
 
             kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
+            cb(kq, "kq_tanh", il);
             kq = ggml_scale(ctx0, kq, 30);
+            cb(kq, "kq_scaled", il);
         }
 
         if (hparams.attn_soft_cap) {
             kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
+            cb(kq, "kq_scaled_1", il);
             kq = ggml_tanh (ctx0, kq);
+            cb(kq, "kq_tanh", il);
             kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
+            cb(kq, "kq_scaled_2", il);
         }
 
         if (kq_b) {
             kq = ggml_add(ctx0, kq, kq_b);
+            cb(kq, "kq_plus_kq_b", il);
         }
 
         kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
         ggml_soft_max_add_sinks(kq, sinks);
+        cb(kq, "kq_soft_max", il);
 
         if (!v_trans) {
            // note: avoid this branch
            v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
+           cb(v, "v_cont", il);
         }
 
         ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+        cb(kqv, "kqv", il);
 
        // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
        if (v_mla) {
            kqv = ggml_mul_mat(ctx0, v_mla, kqv);
+           cb(kqv, "kqv_mla", il);
        }
 
        cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
@@ -1376,13 +1431,13 @@ ggml_tensor * llm_graph_context::build_attn(
 
     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-    assert(!ubatch.equal_seqs());
+    assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1471,7 +1526,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1538,7 +1593,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
    cb(cur, "kqv_out", il);
 
    if (wo) {
@@ -1593,7 +1648,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
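Reviewer note: with `il` now threaded into `build_attn_mha()`, the intermediate attention tensors get stable, layer-suffixed names via `cb()` ("kq", "kq_soft_max", "kqv_out", "kqv_mla", plus `LLAMA_TENSOR_NAME_FATTN` on the flash-attention path). A minimal sketch of picking those names up from the public graph-eval hook is below; it assumes the `cb()` names end up on the ggml tensors and that `llama_context_params.cb_eval` is wired to this function, so treat the prefixes and the wiring as illustrative rather than a documented contract.

```cpp
// Sketch only: assumes the cb(...) names above appear as ggml tensor names
// (typically suffixed with the layer index) and that the context is created
// with cb_eval pointing at this function.
#include <cstdio>
#include <cstring>

#include "llama.h"

static bool inspect_attn_tensors(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;

    const char * name = ggml_get_name(t);
    const bool interesting =
        strncmp(name, "kq_soft_max", 11) == 0 || // post-softmax attention weights
        strncmp(name, "kqv_out",      7) == 0;   // attention output

    if (ask) {
        return interesting; // only request data for the tensors matched above
    }

    if (interesting) {
        printf("%s: [%lld, %lld, %lld, %lld] %s\n", name,
               (long long) t->ne[0], (long long) t->ne[1],
               (long long) t->ne[2], (long long) t->ne[3],
               ggml_type_name(t->type));
    }

    return true; // keep evaluating the graph
}

// Hypothetical wiring:
//   llama_context_params cparams = llama_context_default_params();
//   cparams.cb_eval           = inspect_attn_tensors;
//   cparams.cb_eval_user_data = nullptr;
```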
package/src/llama.cpp/src/llama-graph.h

@@ -78,6 +78,11 @@ struct llm_graph_params;
 
 class llm_graph_input_i {
 public:
+    llm_graph_input_i() {
+        const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+        debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+    }
+
     virtual ~llm_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
@@ -90,6 +95,9 @@ public:
         GGML_UNUSED(params);
         return false;
     }
+protected:
+    // env: LLAMA_GRAPH_INPUT_DEBUG
+    int debug = 0;
 };
 
 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
@@ -687,7 +695,8 @@ struct llm_graph_context {
             ggml_tensor * kq_mask,
             ggml_tensor * sinks, // [n_head_q]
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale) const;
+            float kq_scale,
+            int il) const;
 
     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
 
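The new `llm_graph_input_i` constructor reads `LLAMA_GRAPH_INPUT_DEBUG` once, so the variable has to be in the environment before the context (and its graph inputs) is created. A minimal sketch of toggling it from code rather than from the shell follows; `setenv` is POSIX, and whether the dump actually shows up also depends on debug-level log output being visible in your build and log setup.

```cpp
// Minimal sketch, assuming a POSIX environment. The flag is read once in the
// llm_graph_input_i constructor, so set it before creating the llama context.
// The mask dump goes through LLAMA_LOG_DEBUG, so debug-level logging must be
// visible for it to appear.
#include <cstdlib>

#include "llama.h"

int main() {
    setenv("LLAMA_GRAPH_INPUT_DEBUG", "1", /*overwrite=*/1); // use _putenv_s on Windows

    llama_backend_init();

    // ... load the model and create the context as usual; while the flag is set,
    // llm_graph_input_attn_no_cache::set_input() calls print_mask() and dumps the
    // first ~20x20 block of the attention mask for each batch ...

    llama_backend_free();
    return 0;
}
```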
package/src/llama.cpp/src/llama-hparams.cpp

@@ -1,6 +1,7 @@
 #include "llama-hparams.h"
 
 #include "ggml.h"
+#include <cassert>
 
 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
     if (dense_first) {
@@ -178,3 +179,39 @@ uint32_t llama_hparams::n_layer_kv() const {
 
     return res;
 }
+
+bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+    assert(p0 >= 0 && p1 >= 0);
+
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            {
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+            {
+                if (p1 - p0 >= (int32_t) n_swa) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+                if (p0 < pos_chunk_start) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_SYMMETRIC:
+            {
+                const int32_t half_n_swa = (int32_t) n_swa / 2;
+                const int32_t pos_diff = p1 - p0;
+
+                // Mask if outside the symmetric window
+                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+                    return true;
+                }
+            } break;
+    }
+
+    return false;
+}
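For reference, here is a standalone sketch that mirrors the three window rules above; it does not use llama.cpp headers, and the enum and function names are invented for the example. With n_swa = 4 and a query at position p1 = 9: the standard window keeps keys with p1 - p0 < 4 (p0 >= 6), the chunked window keeps keys inside the query's chunk [8, 12), and the symmetric window keeps |p1 - p0| <= 2 (p0 in [7, 11]); causal masking is applied separately.

```cpp
// Standalone sketch of the window rules (names invented, not llama.cpp API).
// "masked" == the key at position p0 is NOT visible to the query at position p1;
// causal masking is handled elsewhere and ignored here.
#include <cassert>
#include <cstdint>
#include <cstdio>

enum swa_type { SWA_NONE, SWA_STANDARD, SWA_CHUNKED, SWA_SYMMETRIC };

static bool is_masked(uint32_t n_swa, swa_type type, int32_t p0, int32_t p1) {
    assert(p0 >= 0 && p1 >= 0);
    const int32_t w = (int32_t) n_swa;
    switch (type) {
        case SWA_NONE:      return false;                               // full attention
        case SWA_STANDARD:  return p1 - p0 >= w;                        // key too far behind the query
        case SWA_CHUNKED:   return p0 < (p1 / w) * w;                   // key before the query's chunk
        case SWA_SYMMETRIC: return p1 - p0 < -w/2 || p1 - p0 > w/2;     // outside the centered window
    }
    return false;
}

int main() {
    // n_swa = 4, query at p1 = 9
    for (int32_t p0 = 0; p0 <= 11; ++p0) {
        printf("p0=%2d  standard=%d  chunked=%d  symmetric=%d\n", p0,
               (int) !is_masked(4, SWA_STANDARD,  p0, 9),  // visible for p0 >= 6
               (int) !is_masked(4, SWA_CHUNKED,   p0, 9),  // visible for p0 >= 8 (chunk [8,12))
               (int) !is_masked(4, SWA_SYMMETRIC, p0, 9)); // visible for 7 <= p0 <= 11
    }
    return 0;
}
```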
package/src/llama.cpp/src/llama-hparams.h

@@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
 };
 
 enum llama_swa_type {
-    LLAMA_SWA_TYPE_NONE,
-    LLAMA_SWA_TYPE_STANDARD,
-    LLAMA_SWA_TYPE_CHUNKED,
+    LLAMA_SWA_TYPE_NONE = 0,
+    LLAMA_SWA_TYPE_STANDARD = 1,
+    LLAMA_SWA_TYPE_CHUNKED = 2,
+    LLAMA_SWA_TYPE_SYMMETRIC = 3,
 };
 
 struct llama_hparams_posnet {
@@ -227,6 +228,11 @@ struct llama_hparams {
 
     // number of layers for which has_kv() returns true
     uint32_t n_layer_kv() const;
+
+    // note that this function uses different SWA parameters from those in the hparams
+    // TODO: think of a better place for this function
+    // TODO: pack the SWA params in a struct?
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
package/src/llama.cpp/src/llama-impl.h

@@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
 std::string llama_format_tensor_shape(const struct ggml_tensor * t);
 
 std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
+
+#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -197,18 +197,6 @@ llama_kv_cache::llama_kv_cache(
 
     const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
-
-    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
-
-    if (!supports_set_rows) {
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
-    }
-
-    if (!supports_set_rows) {
-        LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
-    }
 }
 
 void llama_kv_cache::clear(bool data) {
@@ -551,11 +539,8 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
     bool success = true;
 
     for (const auto & ubatch : ubatches) {
-        // non-continuous slots require support for ggml_set_rows()
-        const bool cont = supports_set_rows ? false : true;
-
         // only find a suitable slot for the ubatch. don't modify the cells yet
-        const auto sinfo_new = find_slot(ubatch, cont);
+        const auto sinfo_new = find_slot(ubatch, false);
         if (sinfo_new.empty()) {
             success = false;
             break;
@@ -771,8 +756,8 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
             GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
         }
 
-        res.s0 = std::min<
-        res.s1 = std::max<
+        res.s0 = std::min<uint32_t>(res.s0, seq_to_stream[seq_id]);
+        res.s1 = std::max<uint32_t>(res.s1, seq_to_stream[seq_id]);
 
         res.strm[s] = seq_to_stream[seq_id];
         res.idxs[s].reserve(n_tokens);
@@ -964,11 +949,11 @@ bool llama_kv_cache::get_has_shift() const {
     return result;
 }
 
-uint32_t llama_kv_cache::get_n_kv() const {
+uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
     uint32_t result = 0;
 
-    for (uint32_t s = 0; s < n_stream; ++s) {
-        const auto & cells = v_cells[s];
+    for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+        const auto & cells = v_cells[sinfo.strm[s]];
 
         result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
     }
@@ -976,10 +961,6 @@ uint32_t llama_kv_cache::get_n_kv() const {
     return result;
 }
 
-bool llama_kv_cache::get_supports_set_rows() const {
-    return supports_set_rows;
-}
-
 ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
@@ -1017,52 +998,42 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
         // note: v->nb[1] <= v->nb[2]
         return ggml_view_4d(ctx, v,
                 hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
-                ggml_row_size(v->type, hparams.n_embd_head_v),
-                ggml_row_size(v->type, n_embd_v_gqa),
-                ggml_row_size(v->type, n_embd_v_gqa*kv_size),
+                ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+                ggml_row_size(v->type, n_embd_v_gqa),          // v->nb[2]
+                ggml_row_size(v->type, n_embd_v_gqa*kv_size),  // v->nb[3]
                 ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
     }
 
     // note: v->nb[1] > v->nb[2]
     return ggml_view_4d(ctx, v,
             n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
-            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v),
-            ggml_row_size(v->type, kv_size),
-            ggml_row_size(v->type, kv_size*n_embd_v_gqa),
+            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+            ggml_row_size(v->type, kv_size),                       // v->nb[2]
+            ggml_row_size(v->type, kv_size*n_embd_v_gqa),          // v->nb[3]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
 }
 
 ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
 
-    const int64_t n_embd_k_gqa = k->ne[0];
     const int64_t n_tokens = k_cur->ne[2];
 
     k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
 
-    if (
-
-        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
-    }
-
-    return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    if (k->ne[2] > 1) {
+        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
     }
 
-
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * k_view = ggml_view_1d(ctx, k,
-            n_tokens*n_embd_k_gqa,
-            ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
-
-    return ggml_cpy(ctx, k_cur, k_view);
+    return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1072,48 +1043,25 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
 
     v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
 
-    if (
-        if (
-
-            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
-        }
-
-        return ggml_set_rows(ctx, v, v_cur, v_idxs);
-    }
-
-    // [TAG_V_CACHE_VARIABLE]
-    if (n_embd_v_gqa < v->ne[0]) {
-        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (!v_trans) {
+        if (v->ne[2] > 1) {
+            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
        }
 
-
-        ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
-
-        v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
-
-        return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+        return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
-    //
-
-
-
+    // [TAG_V_CACHE_VARIABLE]
+    if (n_embd_v_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    }
 
-
+    // the row becomes a single element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
 
-
-    v_view = ggml_view_1d(ctx, v,
-            n_tokens*n_embd_v_gqa,
-            ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
-    } else {
-        v_cur = ggml_transpose(ctx, v_cur);
-
-        v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
-                (v->ne[1] )*ggml_element_size(v),
-                (sinfo.head())*ggml_element_size(v));
-    }
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
 
-    return
+    return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
 
 ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
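Reviewer note: `cpy_k()`/`cpy_v()` now go through `ggml_set_rows()` unconditionally (the `LLAMA_SET_ROWS`/`ggml_cpy()` fallback path is removed), with the destination cell for each token supplied by the index tensors from `build_input_k_idxs()`/`build_input_v_idxs()`. The plain C++ sketch below is only meant to make that scatter-write indexing concrete; it is not the ggml implementation, and the function and parameter names are invented for the illustration.

```cpp
// Plain C++ sketch of the scatter-write that ggml_set_rows(dst, src, idxs) expresses here:
// row r of src is written into row idxs[r] of dst. Illustration only.
#include <cstdint>
#include <vector>

static void set_rows(std::vector<float> & dst, int64_t dst_rows,
                     const std::vector<float> & src,
                     const std::vector<int64_t> & idxs,
                     int64_t row_size) {
    // dst : [dst_rows x row_size]    - the K (or V) buffer of one layer, one row per cache cell
    // src : [idxs.size() x row_size] - K/V rows for the tokens of the current ubatch
    // idxs: cell index chosen by find_slot() for each token (need not be contiguous)
    for (int64_t r = 0; r < (int64_t) idxs.size(); ++r) {
        const int64_t cell = idxs[r];
        if (cell < 0 || cell >= dst_rows) {
            continue; // out of range; the real code guarantees valid indices instead
        }
        for (int64_t c = 0; c < row_size; ++c) {
            dst[cell*row_size + c] = src[r*row_size + c];
        }
    }
}
```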
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -1143,10 +1091,6 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
 }
 
 void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1163,10 +1107,6 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
 }
 
 void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1453,29 +1393,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
 }
 
 bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
-    assert(p0 >= 0 && p1 >= 0);
-
-    switch (swa_type) {
-        case LLAMA_SWA_TYPE_NONE:
-            {
-            } break;
-        case LLAMA_SWA_TYPE_STANDARD:
-            {
-                if (p1 - p0 >= (int32_t) n_swa) {
-                    return true;
-                }
-            } break;
-        case LLAMA_SWA_TYPE_CHUNKED:
-            {
-                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
-
-                if (p0 < pos_chunk_start) {
-                    return true;
-                }
-            } break;
-    }
-
-    return false;
+    return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
 }
 
 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
@@ -1985,8 +1903,7 @@ bool llama_kv_cache_context::apply() {
     }
 
     kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
-
-    n_kv = kv->get_n_kv();
+    n_kv = kv->get_n_kv(sinfos[i_cur]);
 
     return true;
 }
@@ -2005,10 +1922,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
-bool llama_kv_cache_context::get_supports_set_rows() const {
-    return kv->get_supports_set_rows();
-}
-
 ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }
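Reviewer note: `get_n_kv()` now takes the `slot_info`, so the width of the K/V views is derived only from the streams the current ubatch actually touches rather than from all streams. A small self-contained sketch of that computation follows; the sizes, padding value, and per-stream usage numbers are made up for illustration.

```cpp
// Sketch of the per-slot n_kv computation above: only the streams referenced by the
// slot_info are considered, and the result is the padded "highest used cell + 1",
// clamped to the cache size. All values below are hypothetical.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static uint32_t pad_to(uint32_t x, uint32_t n) { // same idea as GGML_PAD(x, n)
    return ((x + n - 1) / n) * n;
}

int main() {
    const uint32_t kv_size = 4096; // cells per stream
    const uint32_t n_pad   = 32;   // padding granularity

    // "highest occupied cell + 1" per stream
    const std::vector<uint32_t> used_max_p1 = { 45, 1000, 7 };

    // streams actually touched by this ubatch's slot_info
    const std::vector<uint32_t> strm = { 0, 2 };

    uint32_t n_kv = 0;
    for (uint32_t s : strm) {
        n_kv = std::max(n_kv, std::min(kv_size, std::max(n_pad, pad_to(used_max_p1[s], n_pad))));
    }

    printf("n_kv = %u\n", n_kv); // 64: stream 1 (used up to 1000) is not part of this slot
    return 0;
}
```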
package/src/llama.cpp/src/llama-kv-cache.h

@@ -38,8 +38,8 @@ public:
         using idx_vec_t = std::vector<uint32_t>;
 
         // number of streams: ns = s1 - s0 + 1
-
-
+        uint32_t s0;
+        uint32_t s1;
 
         std::vector<llama_seq_id> strm; // [ns]
         std::vector<idx_vec_t> idxs; // [ns]
@@ -139,10 +139,7 @@ public:
    // graph_build API
    //
 
-    uint32_t get_n_kv() const;
-
-    // TODO: temporary
-    bool get_supports_set_rows() const;
+    uint32_t get_n_kv(const slot_info & sinfo) const;
 
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@@ -215,10 +212,7 @@ private:
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;
 
-    //
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
+    // this is the SWA type of the cache - not to be confused with the model SWA type
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;
@@ -318,9 +312,6 @@ public:
 
     uint32_t get_n_kv() const;
 
-    // TODO: temporary
-    bool get_supports_set_rows() const;
-
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
package/src/llama.cpp/src/llama-model-loader.cpp

@@ -788,6 +788,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {