@fugood/llama.node 0.4.7 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +20 -6
- package/lib/index.js +41 -17
- package/lib/index.ts +50 -23
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +9 -9
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +37 -18
- package/src/LlamaContext.h +1 -0
- package/src/TokenizeWorker.cpp +16 -12
- package/src/TokenizeWorker.h +2 -2
- package/src/common.hpp +54 -50
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
package/src/llama.cpp/src/llama-graph.cpp
@@ -9,33 +9,6 @@
 #include <cmath>
 #include <cstring>

-static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
-    // TODO move to hparams if a T5 variant appears that uses a different value
-    const int64_t max_distance = 128;
-
-    if (bidirectional) {
-        n_buckets >>= 1;
-    }
-
-    const int64_t max_exact = n_buckets >> 1;
-
-    int32_t relative_position = x - y;
-    int32_t relative_bucket = 0;
-
-    if (bidirectional) {
-        relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
-    } else {
-        relative_position = -std::min<int32_t>(relative_position, 0);
-    }
-
-    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
-    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
-    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
-
-    return relative_bucket;
-}
-
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
         const int64_t n_tokens = ubatch->n_tokens;
@@ -110,22 +83,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {

 void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
     if (pos_bucket) {
-
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
-        GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
-
-        int32_t * data = (int32_t *) pos_bucket->data;
-
-        const int64_t n_kv = kv_self->n;
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_kv; ++i) {
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
-                }
-            }
-        }
+        kv_self->set_input_pos_bucket(pos_bucket, ubatch);
     }
 }

@@ -403,99 +361,18 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
-    if (self_kq_mask
-
-
-
-        const int64_t n_seqs = ubatch->n_seqs;
-
-        float * data = nullptr;
-        float * data_swa = nullptr;
-
-        if (self_kq_mask) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
-            data = (float *) self_kq_mask->data;
-        }
-
-        if (self_kq_mask_swa) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-            data_swa = (float *) self_kq_mask_swa->data;
-        }
-
-        // Use only the previous KV cells of the correct sequence for each token of the ubatch.
-        // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
-        // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
-        // Causal mask:
-        //    xxx-------
-        //    xxxx------
-        //    xxxxx-----
-        // Non-causal mask:
-        //    xxxxx-----
-        //    xxxxx-----
-        //    xxxxx-----
-        // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-        for (int h = 0; h < 1; ++h) {
-            for (int s = 0; s < n_seqs; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-                for (int j = 0; j < n_seq_tokens; ++j) {
-                    const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
-                    for (int i = 0; i < n_kv; ++i) {
-                        float f;
-                        // mask the token if:
-                        if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
-                            || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
-                        ) {
-                            f = -INFINITY;
-                        } else {
-                            if (hparams.use_alibi) {
-                                f = -std::abs(kv_self->cells[i].pos - pos);
-                            } else {
-                                f = 0.0f;
-                            }
-                        }
-
-                        if (data) {
-                            data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-
-                        // may need to cut off old tokens for sliding window
-                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
-                        if (data_swa) {
-                            if (hparams.n_attn_chunk) {
-                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
-                                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
-                                    f = -INFINITY;
-                                }
-                            } else {
-                                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                                    f = -INFINITY;
-                                }
-                            }
-                            data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-                    }
-                }
-            }
-        }
+    if (self_kq_mask) {
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
+}

-
-
-
-
-                    data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                }
-            }
-        }
+void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
+    if (self_kq_mask) {
+        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }

-
-
-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-            for (int j = 0; j < n_kv; ++j) {
-                data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-            }
-        }
-    }
-        }
+    if (self_kq_mask_swa) {
+        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
    }
 }

@@ -545,7 +422,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_layer (hparams.n_layer),
     n_rot (hparams.n_rot),
     n_ctx (cparams.n_ctx),
-    n_ctx_per_seq (cparams.n_ctx / cparams.n_seq_max),
     n_head (hparams.n_head()),
     n_head_kv (hparams.n_head_kv()),
     n_embd_head_k (hparams.n_embd_head_k),
@@ -1153,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {

     auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);

-    const auto n_kv = kv_self->
+    const auto n_kv = kv_self->get_n();

     auto & cur = inp->pos_bucket;

@@ -1188,16 +1064,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
         ggml_tensor * v_mla,
-        bool v_trans,
         float kq_scale) const {
-
-    //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-    //const int64_t n_head = hparams.n_head(il);
-    //const int64_t n_head_kv = hparams.n_head_kv(il);
+    const bool v_trans = v->nb[1] > v->nb[2];

-
-
+    q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
+    v = ggml_permute(ctx0, v, 0, 2, 1, 3);

     const auto n_tokens = q->ne[1];
     const auto n_head = q->ne[2];
@@ -1336,17 +1208,11 @@ ggml_tensor * llm_graph_context::build_attn(

     const auto & kq_mask = inp->get_kq_mask();

-    ggml_tensor * q =
-
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;

+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);

     if (wo) {
@@ -1369,22 +1235,16 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()

     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);

-
-
-    inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-    //cb(inp->self_kq_mask, "KQ_mask", -1);
-    ggml_set_input(inp->self_kq_mask);
-
-    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    {
+        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");

-
-        GGML_ASSERT(hparams.n_swa > 0);
+        const auto n_kv = kv_self->get_n();

-        inp->
-        //cb(inp->
-        ggml_set_input(inp->
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);

-        inp->
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
     }

     return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
@@ -1409,85 +1269,108 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, v_cur);

     const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
-    const auto & n_ctx = cparams.n_ctx;

-
-
+    // store to KV cache
+    {
+        ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+    }
+
+    const auto & kq_mask = inp->get_kq_mask();

-
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv_self->get_k(ctx0, il);
+    ggml_tensor * v = kv_self->get_v(ctx0, il);

-
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    cb(cur, "kqv_out", il);

-
-
-
+    if (wo) {
+        cur = build_lora_mm(wo, cur);
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }

-
+    return cur;
+}

-
-
+llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);

-
-    ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);

-
+    {
+        const auto n_kv = kv_self->get_kv_base()->get_n();

-
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);

-
-
-        } else {
-            // note: the V cache is transposed when not using flash attention
-            v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
-                ( n_ctx)*ggml_element_size(kv_self->v_l[il]),
-                (kv_head)*ggml_element_size(kv_self->v_l[il]));
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    }

-
-
-
+    {
+        GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
+
+        const auto n_kv = kv_self->get_kv_swa()->get_n();

-
+        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
     }

+    return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_cgraph * gf,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+        float kq_scale,
+        int il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    ggml_build_forward_expand(gf, v_cur);
+
     const bool is_swa = hparams.is_swa(il);

+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+
+    const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+
+    // store to KV cache
+    {
+        ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+    }
+
     const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

-
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv->get_k(ctx0, il);
+    ggml_tensor * v = kv->get_v(ctx0, il);

-
-
-    const auto & n_embd_head_k = hparams.n_embd_head_k;
-    const auto & n_embd_head_v = hparams.n_embd_head_v;
-
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k =
-        ggml_view_3d(ctx0, kv_self->k_l[il],
-            n_embd_head_k, n_kv, n_head_kv,
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
-            0);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = !v_trans ?
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-            n_embd_head_v, n_kv, n_head_kv,
-            ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-            ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
-            0) :
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-            n_kv, n_embd_head_v, n_head_kv,
-            ggml_element_size(kv_self->v_l[il])*n_ctx,
-            ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
-            0);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);

     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (wo_b) {
@@ -1534,17 +1417,11 @@ ggml_tensor * llm_graph_context::build_attn(

     const auto & kq_mask = inp->get_kq_mask_cross();

-    ggml_tensor * q =
-
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;

+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);

     if (wo) {
@@ -1712,3 +1589,30 @@ void llm_graph_context::build_pooling(

     ggml_build_forward_expand(gf, cur);
 }
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+    // TODO move to hparams if a T5 variant appears that uses a different value
+    const int64_t max_distance = 128;
+
+    if (bidirectional) {
+        n_buckets >>= 1;
+    }
+
+    const int64_t max_exact = n_buckets >> 1;
+
+    int32_t relative_position = x - y;
+    int32_t relative_bucket = 0;
+
+    if (bidirectional) {
+        relative_bucket += (relative_position > 0) * n_buckets;
+        relative_position = abs(relative_position);
+    } else {
+        relative_position = -std::min<int32_t>(relative_position, 0);
+    }
+
+    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+
+    return relative_bucket;
+}
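Note: `llama_relative_position_bucket` is no longer a file-local `static` helper; the KV cache now fills the T5 position-bucket input itself, so the function is re-added at the end of llama-graph.cpp and declared in llama-graph.h (next hunk group). As a quick illustration of what that helper computes, here is a small standalone C++ sketch that copies the bucketing formula from the hunk above and prints the bucket index for a few causal distances. The `llama_pos` alias and the `main` driver exist only so this snippet compiles on its own; they are not part of the package.

    // Standalone sketch of the T5 relative-position bucketing shown above.
    // llama_pos is aliased here so the file is self-contained.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    using llama_pos = int32_t;

    static int32_t relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
        const int64_t max_distance = 128;

        if (bidirectional) {
            n_buckets >>= 1;
        }

        const int64_t max_exact = n_buckets >> 1;

        int32_t relative_position = x - y;
        int32_t relative_bucket = 0;

        if (bidirectional) {
            relative_bucket += (relative_position > 0) * n_buckets;
            relative_position = abs(relative_position);
        } else {
            relative_position = -std::min<int32_t>(relative_position, 0);
        }

        int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
        relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
        relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);

        return relative_bucket;
    }

    int main() {
        // small distances get their own bucket; larger distances share log-spaced buckets
        for (int d : {1, 2, 7, 15, 16, 31, 64, 127, 200}) {
            printf("distance %3d -> bucket %d\n", d, relative_position_bucket(0, d, 32, false));
        }
        return 0;
    }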
package/src/llama.cpp/src/llama-graph.h
@@ -19,6 +19,7 @@ struct llama_cparams;

 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_unified_iswa;
 class llama_kv_cache_recurrent;

 // certain models (typically multi-modal) can produce different types of graphs
@@ -255,6 +256,31 @@ public:

     void set_input(const llama_ubatch * ubatch) override;

+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const llama_kv_cache_unified * kv_self;
+};
+
+class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv_unified_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_unified_iswa * kv_self) :
+        hparams(hparams),
+        cparams(cparams),
+        kv_self(kv_self) {
+    }
+    ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

@@ -266,7 +292,7 @@ public:
     const llama_hparams & hparams;
     const llama_cparams & cparams;

-    const
+    const llama_kv_cache_unified_iswa * kv_self;
 };

 class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -378,7 +404,6 @@ struct llm_graph_context {
     const int64_t n_layer;
     const int64_t n_rot;
     const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_ctx_per_seq;
     const int64_t n_head;
     const int64_t n_head_kv;
     const int64_t n_embd_head_k;
@@ -507,13 +532,12 @@ struct llm_graph_context {

     ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
-            ggml_tensor * q,
-            ggml_tensor * k,
-            ggml_tensor * v,
+            ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
-            ggml_tensor * v_mla,
-            bool v_trans,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale) const;

     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@@ -546,6 +570,21 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;

+    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            float kq_scale,
+            int il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;

     ggml_tensor * build_attn(
@@ -596,3 +635,6 @@ struct llm_graph_context {
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
 };
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
package/src/llama.cpp/src/llama-hparams.cpp
@@ -2,6 +2,22 @@

 #include "ggml.h"

+void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+    }
+}
+
+bool llama_hparams::is_swa_any() const {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (swa_layers[il]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 uint32_t llama_hparams::n_head(uint32_t il) const {
     if (il < n_layer) {
         return n_head_arr[il];
@@ -72,7 +88,7 @@ uint32_t llama_hparams::n_embd_v_s() const {

 bool llama_hparams::is_swa(uint32_t il) const {
     if (il < n_layer) {
-        return
+        return swa_layers[il];
     }

     GGML_ABORT("fatal error");
package/src/llama.cpp/src/llama-hparams.h
@@ -14,6 +14,12 @@ enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
 };

+enum llama_swa_type {
+    LLAMA_SWA_TYPE_NONE = 0,
+    LLAMA_SWA_TYPE_STANDARD = 1,
+    LLAMA_SWA_TYPE_CHUNKED = 2,
+};
+
 struct llama_hparams_posnet {
     uint32_t n_embd;
     uint32_t n_layer;
@@ -35,8 +41,6 @@ struct llama_hparams {
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
-    uint32_t n_swa = 0; // sliding window attention (SWA)
-    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
@@ -96,6 +100,15 @@ struct llama_hparams {

     std::array<int, 4> rope_sections;

+    // Sliding Window Attention (SWA)
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+    // the size of the sliding window (0 - no SWA)
+    uint32_t n_swa = 0;
+    // if swa_layers[il] == true, then layer il is SWA
+    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+    // by default, all layers are dense
+    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+
     // for State Space Models
     uint32_t ssm_d_conv = 0;
     uint32_t ssm_d_inner = 0;
@@ -116,11 +129,10 @@ struct llama_hparams {
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
+    bool use_kq_norm = true;

+    // llama4
     uint32_t n_moe_layer_step = 0;
-    bool use_kq_norm = true;
-    uint32_t n_attn_chunk = 0;
-    // values below seems to be fixed on llama4
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 8192;
     float f_attn_temp_scale = 0.1;
@@ -133,6 +145,23 @@ struct llama_hparams {
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

+    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+    // note that if n_pattern == 0, all layers are SWA
+    // if n_pattern == 1, all layers are dense
+    // example: n_pattern = 3
+    //   il == 0: swa
+    //   il == 1: swa
+    //   il == 2: dense
+    //   il == 3: swa
+    //   il == 4: swa
+    //   il == 5: dense
+    //   il == 6: swa
+    //   etc ...
+    void set_swa_pattern(uint32_t n_pattern);
+
+    // return true if one of the layers is SWA
+    bool is_swa_any() const;
+
     uint32_t n_head(uint32_t il = 0) const;

     uint32_t n_head_kv(uint32_t il = 0) const;
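Note: the scalar `n_swa_pattern` is gone; layers are now flagged individually in `swa_layers`, which `set_swa_pattern()` fills (see the llama-hparams.cpp hunks above). The short standalone sketch below reproduces that formula and checks it against the `n_pattern = 3` example from the comment block; the 7-layer count is chosen purely for illustration and is not taken from the diff.

    // Standalone sketch of llama_hparams::set_swa_pattern(): with n_pattern = 3,
    // every 3rd layer (il % 3 == 2) is dense and the rest use sliding-window attention.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const uint32_t n_layer   = 7;  // illustrative layer count only
        const uint32_t n_pattern = 3;

        std::vector<bool> swa_layers(n_layer);
        for (uint32_t il = 0; il < n_layer; ++il) {
            // same expression as in set_swa_pattern(); n_pattern == 0 means "all layers SWA"
            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
        }

        for (uint32_t il = 0; il < n_layer; ++il) {
            printf("il == %u: %s\n", (unsigned) il, swa_layers[il] ? "swa" : "dense");
        }
        // prints: swa, swa, dense, swa, swa, dense, swa -- matching the comment's example
        return 0;
    }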
|