@fugood/llama.node 1.0.0-beta.5 → 1.0.0-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +3 -1
- package/lib/index.js +2 -0
- package/lib/index.ts +3 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +27 -26
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +28 -7
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +14 -17
- package/src/common.hpp +7 -6
- package/src/llama.cpp/CMakeLists.txt +15 -4
- package/src/llama.cpp/common/CMakeLists.txt +15 -24
- package/src/llama.cpp/common/arg.cpp +172 -110
- package/src/llama.cpp/common/chat-parser.cpp +385 -0
- package/src/llama.cpp/common/chat-parser.h +120 -0
- package/src/llama.cpp/common/chat.cpp +726 -596
- package/src/llama.cpp/common/chat.h +74 -8
- package/src/llama.cpp/common/common.cpp +56 -38
- package/src/llama.cpp/common/common.h +9 -3
- package/src/llama.cpp/common/json-partial.cpp +256 -0
- package/src/llama.cpp/common/json-partial.h +38 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/src/llama.cpp/common/sampling.cpp +7 -8
- package/src/llama.cpp/common/speculative.cpp +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
- package/src/llama.cpp/ggml/include/ggml.h +22 -3
- package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
- package/src/llama.cpp/include/llama.h +145 -40
- package/src/llama.cpp/src/CMakeLists.txt +5 -1
- package/src/llama.cpp/src/llama-arch.cpp +99 -3
- package/src/llama.cpp/src/llama-arch.h +10 -1
- package/src/llama.cpp/src/llama-batch.cpp +728 -272
- package/src/llama.cpp/src/llama-batch.h +112 -54
- package/src/llama.cpp/src/llama-chat.cpp +19 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +525 -339
- package/src/llama.cpp/src/llama-context.h +38 -17
- package/src/llama.cpp/src/llama-cparams.cpp +4 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-grammar.cpp +12 -2
- package/src/llama.cpp/src/llama-graph.cpp +413 -353
- package/src/llama.cpp/src/llama-graph.h +112 -56
- package/src/llama.cpp/src/llama-hparams.cpp +10 -2
- package/src/llama.cpp/src/llama-hparams.h +13 -2
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
- package/src/llama.cpp/src/llama-kv-cells.h +415 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
- package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
- package/src/llama.cpp/src/llama-memory.cpp +41 -0
- package/src/llama.cpp/src/llama-memory.h +86 -5
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1137 -528
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +69 -32
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +11 -7
- package/src/llama.cpp/src/unicode.cpp +5 -0
- package/src/tts_utils.h +1 -1
- package/src/llama.cpp/common/json.hpp +0 -24766
- package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
- package/src/llama.cpp/common/minja/minja.hpp +0 -2974
- package/src/llama.cpp/common/stb_image.h +0 -7988
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
- package/src/llama.cpp/src/llama-kv-cache.h +0 -515
- /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/src/llama.cpp/src/llama-model.cpp
@@ -5,7 +5,11 @@
 #include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
-#include "llama-kv-cache.h"
+
+#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache-unified-iswa.h"
+#include "llama-memory-hybrid.h"
+#include "llama-memory-recurrent.h"
 
 #include "ggml-cpp.h"
 
@@ -77,6 +81,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_40B: return "40B";
         case LLM_TYPE_65B: return "65B";
         case LLM_TYPE_70B: return "70B";
+        case LLM_TYPE_142B: return "142B";
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
@@ -466,6 +471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+    std::fill(
+        hparams.recurrent_layer_arr.begin(),
+        hparams.recurrent_layer_arr.end(),
+        llm_arch_is_recurrent(ml.get_arch()));
 
     std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
 
@@ -540,6 +549,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     uint32_t n_vocab = 0;
     ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
 
+    // for classifier models
+    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
+    if (!classifier_labels.empty()) {
+        hparams.n_cls_out = classifier_labels.size();
+    }
+
     // arch-specific KVs
     switch (arch) {
         case LLM_ARCH_LLAMA:
@@ -589,6 +604,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.use_kq_norm = false;
                 }
             } break;
+        case LLM_ARCH_ARCEE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // Arcee uses the same structure as Llama
+                switch (hparams.n_layer) {
+                    case 36: type = LLM_TYPE_4B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -729,6 +754,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     }
                 }
             } break;
+        case LLM_ARCH_NEO_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                if (hparams.n_layer == 28) {
+                    type = LLM_TYPE_250M;
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -952,6 +987,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 46: type = LLM_TYPE_27B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
+                hparams.f_attention_scale = type == LLM_TYPE_27B
+                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
             } break;
         case LLM_ARCH_GEMMA3:
             {
@@ -972,6 +1012,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
 
+                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
                 hparams.f_attention_scale = type == LLM_TYPE_27B
                     ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
                     : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
@@ -1429,6 +1470,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DOTS1:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                switch (hparams.n_layer) {
+                    case 62: type = LLM_TYPE_142B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
@@ -2113,7 +2168,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NOMIC_BERT_MOE:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
+                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
 
                     if (arch == LLM_ARCH_BERT) {
                         pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -2121,8 +2176,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
                         cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
 
-                        cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
-                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
+                        cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
                     }
 
                     tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -2131,7 +2186,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
-                        if (arch == LLM_ARCH_BERT) {
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+                        if (!layer.wqkv) {
                             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                             layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
 
@@ -2140,12 +2198,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                             layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                             layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-                        } else {
-                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        }
-
-                        if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
                         }
 
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
@@ -2175,6 +2227,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_NEO_BERT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_JINA_BERT_V2:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -2212,8 +2290,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                         layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
 
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                         layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -2489,7 +2567,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -4107,6 +4189,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                     }
                 } break;
+            case LLM_ARCH_DOTS1:
+                {
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
+            case LLM_ARCH_ARCEE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -4351,6 +4516,15 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
         LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
+
+        if (!classifier_labels.empty()) {
+            LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
+
+            size_t i = 0;
+            for (auto label : classifier_labels) {
+                LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
+            }
+        }
     }
 
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
@@ -4533,6 +4707,8 @@ struct llm_build_llama : public llm_graph_context {
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -4595,9 +4771,7 @@ struct llm_build_llama : public llm_graph_context {
                 cb(cur, "attn_out", il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -4693,6 +4867,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -4769,9 +4945,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
                 cb(cur, "attn_out", il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -4871,6 +5045,9 @@ struct llm_build_deci : public llm_graph_context {
         auto * inp_attn = build_attn_inp_kv_unified();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -4944,9 +5121,7 @@ struct llm_build_deci : public llm_graph_context {
                     Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5025,6 +5200,8 @@ struct llm_build_baichuan : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -5076,9 +5253,7 @@ struct llm_build_baichuan : public llm_graph_context {
                     Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5147,6 +5322,8 @@ struct llm_build_xverse : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -5191,9 +5368,7 @@ struct llm_build_xverse : public llm_graph_context {
                     Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5261,6 +5436,8 @@ struct llm_build_falcon : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * attn_norm;
 
@@ -5316,9 +5493,7 @@ struct llm_build_falcon : public llm_graph_context {
                     Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
                 attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
@@ -5387,6 +5562,8 @@ struct llm_build_grok : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -5446,9 +5623,7 @@ struct llm_build_grok : public llm_graph_context {
                     Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5547,6 +5722,8 @@ struct llm_build_dbrx : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -5597,9 +5774,7 @@ struct llm_build_dbrx : public llm_graph_context {
                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5679,6 +5854,8 @@ struct llm_build_starcoder : public llm_graph_context {
         inpL = ggml_add(ctx0, inpL, pos);
         cb(inpL, "inpL", -1);
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -5711,9 +5888,7 @@ struct llm_build_starcoder : public llm_graph_context {
                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -5778,6 +5953,8 @@ struct llm_build_refact : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -5810,9 +5987,7 @@ struct llm_build_refact : public llm_graph_context {
                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5883,8 +6058,10 @@ struct llm_build_bert : public llm_graph_context {
         inpL = build_inp_embd(model.tok_embd);
 
         // token types are hardcoded to zero ("Sentence A")
-        ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
-        inpL = ggml_add(ctx0, inpL, type_row0);
+        if (model.type_embd) {
+            ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+            inpL = ggml_add(ctx0, inpL, type_row0);
+        }
         if (model.arch == LLM_ARCH_BERT) {
             inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
         }
@@ -5896,17 +6073,34 @@ struct llm_build_bert : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_no_cache();
 
-        // iterate layers
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * cur = inpL;
 
-            ggml_tensor * Qcur;
-            ggml_tensor * Kcur;
-            ggml_tensor * Vcur;
+            {
+                ggml_tensor * Qcur;
+                ggml_tensor * Kcur;
+                ggml_tensor * Vcur;
 
-            // self-attention
-            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
-                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+                // self-attention
+                if (model.layers[il].wqkv) {
+                    cur = build_lora_mm(model.layers[il].wqkv, cur);
+                    cb(cur, "wqkv", il);
+
+                    if (model.layers[il].bqkv) {
+                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                        cb(cur, "bqkv", il);
+                    }
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                } else {
+                    Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+                }
 
                 if (model.layers[il].attn_q_norm) {
                     Qcur = build_norm(Qcur,
@@ -5915,8 +6109,6 @@ struct llm_build_bert : public llm_graph_context {
                         LLM_NORM, il);
                 }
 
-                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
-
                 if (model.layers[il].attn_k_norm) {
                     Kcur = build_norm(Kcur,
                         model.layers[il].attn_k_norm,
@@ -5924,54 +6116,36 @@ struct llm_build_bert : public llm_graph_context {
                         LLM_NORM, il);
                 }
 
-                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
-
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-            } else {
-                // compute Q and K and RoPE them
-                cur = build_lora_mm(model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                    cb(cur, "bqkv", il);
-                }
 
-                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                // RoPE
+                if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                    );
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                    Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+                }
 
-                Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
 
-                Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                cb(cur, "kqv_out", il);
             }
 
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            cur = build_attn(inp_attn, gf,
-                    model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-            cb(cur, "kqv_out", il);
-
-            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -6020,7 +6194,7 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
-                    LLM_FFN_GELU, LLM_FFN_PAR, il);
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
@@ -6051,8 +6225,8 @@
     }
 };
 
-struct llm_build_bloom : public llm_graph_context {
-    llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+struct llm_build_neo_bert : public llm_graph_context {
+    llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
 
@@ -6060,52 +6234,164 @@ struct llm_build_bloom : public llm_graph_context {
 
         ggml_tensor * cur;
         ggml_tensor * inpL;
+        ggml_tensor * inp_pos = build_inp_pos();
 
+        // construct input embeddings (token, type, position)
         inpL = build_inp_embd(model.tok_embd);
+        cb(inpL, "inp_embd", -1);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_no_cache();
 
-        inpL = build_norm(inpL,
-                model.tok_norm,
-                model.tok_norm_b,
-                LLM_NORM, -1);
-        cb(inpL, "inp_norm", -1);
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
 
         for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * cur = inpL;
+
+            // pre-norm
             cur = build_norm(inpL,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, il);
-            cb(cur, "attn_norm", il);
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
 
-            // self-attention
             {
+                ggml_tensor * Qcur;
+                ggml_tensor * Kcur;
+                ggml_tensor * Vcur;
+
+                // self-attention
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
+                // RoPE
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, nullptr,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                cb(cur, "kqv_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // re-add the layer input
+            cur = ggml_add(ctx0, cur, inpL);
+
+            ggml_tensor * ffn_inp = cur;
+            cb(ffn_inp, "ffn_inp", il);
+
+            // pre-norm
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,
+                    NULL, NULL, NULL, NULL, NULL,
+                    model.layers[il].ffn_down,
+                    NULL, NULL, NULL,
+                    LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+
+            // attentions bypass the intermediate layer
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm_enc, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_embd", -1);
+        res->t_embd = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_bloom : public llm_graph_context {
+    llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        inpL = build_norm(inpL,
+                model.tok_norm,
+                model.tok_norm_b,
+                LLM_NORM, -1);
+        cb(inpL, "inp_norm", -1);
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -6182,6 +6468,8 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6182
6468
|
cb(inpL, "inpL", -1);
|
|
6183
6469
|
}
|
|
6184
6470
|
|
|
6471
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6472
|
+
|
|
6185
6473
|
for (int il = 0; il < n_layer; ++il) {
|
|
6186
6474
|
ggml_tensor * attn_norm;
|
|
6187
6475
|
|
|
@@ -6244,9 +6532,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6244
6532
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6245
6533
|
}
|
|
6246
6534
|
|
|
6247
|
-
if (il == n_layer - 1) {
|
|
6248
|
-
// skip computing output for unused tokens
|
|
6249
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6535
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6250
6536
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6251
6537
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6252
6538
|
}
|
|
@@ -6315,6 +6601,8 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
6315
6601
|
|
|
6316
6602
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6317
6603
|
|
|
6604
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6605
|
+
|
|
6318
6606
|
for (int il = 0; il < n_layer; ++il) {
|
|
6319
6607
|
// norm
|
|
6320
6608
|
cur = build_norm(inpL,
|
|
@@ -6390,9 +6678,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
6390
6678
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6391
6679
|
}
|
|
6392
6680
|
|
|
6393
|
-
if (il == n_layer - 1) {
|
|
6394
|
-
// skip computing output for unused tokens
|
|
6395
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6681
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6396
6682
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6397
6683
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
6398
6684
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
@@ -6467,6 +6753,8 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
6467
6753
|
|
|
6468
6754
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6469
6755
|
|
|
6756
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6757
|
+
|
|
6470
6758
|
for (int il = 0; il < n_layer; ++il) {
|
|
6471
6759
|
ggml_tensor * inpSA = inpL;
|
|
6472
6760
|
|
|
@@ -6513,9 +6801,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
6513
6801
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6514
6802
|
}
|
|
6515
6803
|
|
|
6516
|
-
if (il == n_layer - 1) {
|
|
6517
|
-
// skip computing output for unused tokens
|
|
6518
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6804
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6519
6805
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6520
6806
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6521
6807
|
}
|
|
@@ -6584,6 +6870,8 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
6584
6870
|
|
|
6585
6871
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6586
6872
|
|
|
6873
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6874
|
+
|
|
6587
6875
|
for (int il = 0; il < n_layer; ++il) {
|
|
6588
6876
|
ggml_tensor * inpSA = inpL;
|
|
6589
6877
|
|
|
@@ -6633,9 +6921,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
6633
6921
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6634
6922
|
}
|
|
6635
6923
|
|
|
6636
|
-
if (il == n_layer - 1) {
|
|
6637
|
-
// skip computing output for unused tokens
|
|
6638
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6924
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6639
6925
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6640
6926
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6641
6927
|
}
|
|
@@ -6705,6 +6991,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
6705
6991
|
int sections[4];
|
|
6706
6992
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
|
6707
6993
|
|
|
6994
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6995
|
+
|
|
6708
6996
|
for (int il = 0; il < n_layer; ++il) {
|
|
6709
6997
|
ggml_tensor * inpSA = inpL;
|
|
6710
6998
|
|
|
@@ -6754,9 +7042,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
6754
7042
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6755
7043
|
}
|
|
6756
7044
|
|
|
6757
|
-
if (il == n_layer - 1) {
|
|
6758
|
-
// skip computing output for unused tokens
|
|
6759
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7045
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6760
7046
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6761
7047
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6762
7048
|
}
|
|
@@ -6823,6 +7109,8 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
6823
7109
|
|
|
6824
7110
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6825
7111
|
|
|
7112
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7113
|
+
|
|
6826
7114
|
for (int il = 0; il < n_layer; ++il) {
|
|
6827
7115
|
ggml_tensor * inpSA = inpL;
|
|
6828
7116
|
|
|
@@ -6881,9 +7169,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
6881
7169
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6882
7170
|
}
|
|
6883
7171
|
|
|
6884
|
-
if (il == n_layer - 1) {
|
|
6885
|
-
// skip computing output for unused tokens
|
|
6886
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7172
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
6887
7173
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6888
7174
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6889
7175
|
}
|
|
@@ -6982,6 +7268,8 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
6982
7268
|
|
|
6983
7269
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6984
7270
|
|
|
7271
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7272
|
+
|
|
6985
7273
|
for (int il = 0; il < n_layer; ++il) {
|
|
6986
7274
|
ggml_tensor * inpSA = inpL;
|
|
6987
7275
|
|
|
@@ -7034,9 +7322,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
7034
7322
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7035
7323
|
}
|
|
7036
7324
|
|
|
7037
|
-
if (il == n_layer - 1) {
|
|
7038
|
-
// skip computing output for unused tokens
|
|
7039
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7325
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7040
7326
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7041
7327
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7042
7328
|
}
|
|
@@ -7103,6 +7389,8 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7103
7389
|
|
|
7104
7390
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7105
7391
|
|
|
7392
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7393
|
+
|
|
7106
7394
|
for (int il = 0; il < n_layer; ++il) {
|
|
7107
7395
|
ggml_tensor * inpSA = inpL;
|
|
7108
7396
|
|
|
@@ -7155,9 +7443,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7155
7443
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7156
7444
|
}
|
|
7157
7445
|
|
|
7158
|
-
if (il == n_layer - 1) {
|
|
7159
|
-
// skip computing output for unused tokens
|
|
7160
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7446
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7161
7447
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7162
7448
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7163
7449
|
}
|
|
@@ -7233,6 +7519,8 @@ struct llm_build_phi2 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             attn_norm_output = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -7295,9 +7583,7 @@ struct llm_build_phi2 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
                 attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
@@ -7369,6 +7655,8 @@ struct llm_build_phi3 : public llm_graph_context {
             inp_attn = build_attn_inp_kv_unified();
         }
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
 
@@ -7432,9 +7720,7 @@ struct llm_build_phi3 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor* inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 residual = ggml_get_rows(ctx0, residual, inp_out_ids);
             }
@@ -7520,15 +7806,16 @@ struct llm_build_plamo : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
-        for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
 
+        for (int il = 0; il < n_layer; ++il) {
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
-            ggml_tensor * attention_norm = cur;
+            ggml_tensor * sa_inp = cur;
 
             // self-attention
             {
@@ -7566,18 +7853,17 @@ struct llm_build_plamo : public llm_graph_context {
                         model.layers[il].wo, NULL,
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
-            ggml_tensor * sa_out = cur;
 
-            cur = attention_norm;
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
+                sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
 
+            ggml_tensor * sa_out = cur;
+
+            cur = sa_inp;
+
             // feed-forward network
             {
                 cur = build_ffn(cur,
@@ -7642,6 +7928,8 @@ struct llm_build_gpt2 : public llm_graph_context {
         inpL = ggml_add(ctx0, inpL, pos);
         cb(inpL, "inpL", -1);
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -7674,9 +7962,7 @@ struct llm_build_gpt2 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -7746,6 +8032,8 @@ struct llm_build_codeshell : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -7790,9 +8078,7 @@ struct llm_build_codeshell : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -7846,128 +8132,128 @@ struct llm_build_codeshell : public llm_graph_context {
 
 struct llm_build_orion : public llm_graph_context {
     llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
         ggml_tensor * cur;
         ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, model.layers[il].attn_norm_b,
                     LLM_NORM, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
             {
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 // if (model.layers[il].bq) {
                 //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                 //     cb(Qcur, "Qcur", il);
                 // }
 
                 ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 // if (model.layers[il].bk) {
                 //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                 //     cb(Kcur, "Kcur", il);
                 // }
 
                 ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 // if (model.layers[il].bv) {
                 //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 //     cb(Vcur, "Vcur", il);
                 // }
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
 
                 Kcur = ggml_rope_ext(
                         ctx0, Kcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
             cur = build_norm(ffn_inp,
                     model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
                     LLM_NORM, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
 
             cur = build_cvec(cur, il);
             cb(cur, "l_out", il);
 
             // input for next layer
             inpL = cur;
         }
 
         cur = inpL;
 
         cur = build_norm(cur,
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
 
         cb(cur, "result_norm", -1);
         res->t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 };
 
@@ -7988,6 +8274,8 @@ struct llm_build_internlm2 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -8046,9 +8334,7 @@ struct llm_build_internlm2 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -8124,6 +8410,8 @@ struct llm_build_minicpm3 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -8243,15 +8531,13 @@ struct llm_build_minicpm3 : public llm_graph_context {
                         q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
             // scale_res - scale the hidden states for residual connection
-            const float scale_res = scale_depth/sqrtf(float(n_layer));
+            const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
             cur = ggml_scale(ctx0, cur, scale_res);
             cb(cur, "hidden_scaled", il);
 
@@ -8328,6 +8614,8 @@ struct llm_build_gemma : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             // norm
             cur = build_norm(inpL,
@@ -8373,9 +8661,7 @@ struct llm_build_gemma : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -8444,6 +8730,8 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             // norm
             cur = build_norm(inpL,
@@ -8481,32 +8769,23 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
-                switch (model.type) {
-                    case LLM_TYPE_2B:
-                    case LLM_TYPE_9B:
-                    case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
-                    default: GGML_ABORT("fatal error");
-                };
-                cb(Qcur, "Qcur_scaled", il);
+                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
             }
 
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
             cur = build_norm(cur,
                     model.layers[il].attn_post_norm, NULL,
                     LLM_NORM_RMS, il);
             cb(cur, "attn_post_norm", il);
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
-            }
-
             ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
             cb(sa_out, "sa_out", il);
 
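The gemma2_iswa hunk above collapses the per-model-type switch into one pre-scale of Qcur by hparams.f_attention_scale, with 1.0f passed to build_attn. A hedged sketch of the underlying identity, that scaling Q before the dot product equals scaling the logits after it (plain C++, illustrative numbers only):

#include <cmath>
#include <cstdio>

int main() {
    const float q[2] = {0.5f, -1.0f};
    const float k[2] = {2.0f, 0.25f};
    const float scale = 1.0f / sqrtf(2.0f); // 1/sqrt(n_embd_head)

    float logit_post = 0.f, logit_pre = 0.f;
    for (int i = 0; i < 2; ++i) {
        logit_post += q[i] * k[i];           // scale applied after Q.K
        logit_pre  += (q[i] * scale) * k[i]; // scale folded into Q up front
    }
    logit_post *= scale;

    printf("post = %f, pre = %f\n", (double) logit_post, (double) logit_pre); // identical
    return 0;
}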
@@ -8585,6 +8864,8 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
         // TODO: is causal == true correct? might need some changes
         auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             const float freq_base_l = model.get_rope_freq_base (cparams, il);
             const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
@@ -8629,9 +8910,17 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
+                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
 
             cur = build_norm(cur,
@@ -8639,13 +8928,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_post_norm", il);
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
-            }
-
             ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
             cb(sa_out, "sa_out", il);
 
@@ -8716,6 +8998,8 @@ struct llm_build_starcoder2 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -8774,9 +9058,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -8837,8 +9119,9 @@ struct llm_build_mamba : public llm_graph_context {
         // {n_embd, n_tokens}
         inpL = build_inp_embd(model.tok_embd);
 
-        ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
+        auto * rs_inp = build_rs_inp();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
 
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8847,12 +9130,9 @@ struct llm_build_mamba : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
-
-            cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
+            cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -8886,15 +9166,14 @@ struct llm_build_mamba : public llm_graph_context {
 
     // TODO: split
     ggml_tensor * build_mamba_layer(
-            ggml_cgraph * gf,
-            ggml_tensor * cur,
-            ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
-            const llama_ubatch & ubatch,
-            int il) const {
-        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+            llm_graph_input_rs * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * cur,
+            const llama_ubatch & ubatch,
+            int il) const {
+        const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
 
-        const auto kv_head = kv_self->head;
+        const auto kv_head = mctx_cur->get_head();
 
         const int64_t d_conv = hparams.ssm_d_conv;
         const int64_t d_inner = hparams.ssm_d_inner;
@@ -8912,17 +9191,17 @@ struct llm_build_mamba : public llm_graph_context {
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
 
-        ggml_tensor * conv_states_all = kv_self->k_l[il];
-        ggml_tensor * ssm_states_all = kv_self->v_l[il];
+        ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+        ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
 
         // (ab)using the KV cache to store the states
-        ggml_tensor * conv = build_copy_mask_state(
-                gf, conv_states_all, state_copy, state_mask,
-                hparams.n_embd_k_s(), n_seqs);
+        ggml_tensor * conv = build_rs(
+                inp, gf, conv_states_all,
+                hparams.n_embd_r(), n_seqs);
         conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-        ggml_tensor * ssm = build_copy_mask_state(
-                gf, ssm_states_all, state_copy, state_mask,
-                hparams.n_embd_v_s(), n_seqs);
+        ggml_tensor * ssm = build_rs(
+                inp, gf, ssm_states_all,
+                hparams.n_embd_s(), n_seqs);
         ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
 
         // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
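The mamba hunks above route all recurrent-state access through one memory-context object: build_rs() takes the graph input plus the per-layer state tensor, and the cache head comes from get_head() instead of a raw kv_self pointer. A toy sketch of that shape with invented stand-in types (recurrent_context here is not the real llama_memory_recurrent_context API):

#include <cstdio>
#include <vector>

struct recurrent_context {
    int head = 0;                              // first cache slot used by this ubatch
    std::vector<std::vector<float>> r_l, s_l;  // per-layer conv (r) and ssm (s) states

    int get_head() const { return head; }
    const std::vector<float> & get_r_l(int il) const { return r_l[il]; }
    const std::vector<float> & get_s_l(int il) const { return s_l[il]; }
};

// stand-in for build_rs(): load one layer's state for the current sequences
static std::vector<float> load_state(const recurrent_context & mctx, int il) {
    return mctx.get_s_l(il);
}

int main() {
    recurrent_context mctx;
    mctx.r_l = {{0.1f}, {0.2f}};
    mctx.s_l = {{1.0f}, {2.0f}};

    for (int il = 0; il < 2; ++il) {
        std::vector<float> ssm = load_state(mctx, il);
        printf("layer %d: head=%d ssm[0]=%f\n", il, mctx.get_head(), (double) ssm[0]);
    }
    return 0;
}

The design point is that every helper that used to take state_copy/state_mask tensors now consults the same context object, so the cache layout can change without touching each builder.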
@@ -9035,13 +9314,15 @@ struct llm_build_command_r : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
-        for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
 
+        for (int il = 0; il < n_layer; ++il) {
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM, il);
             cb(cur, "attn_norm", il);
+
             ggml_tensor * ffn_inp = cur;
 
             // self-attention
@@ -9109,9 +9390,7 @@ struct llm_build_command_r : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
                 ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9182,6 +9461,8 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified_iswa();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             const bool is_swa = hparams.is_swa(il);
 
@@ -9244,9 +9525,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
                 ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9317,6 +9596,8 @@ struct llm_build_olmo : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -9375,9 +9656,7 @@ struct llm_build_olmo : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -9445,6 +9724,8 @@ struct llm_build_olmo2 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -9495,18 +9776,16 @@ struct llm_build_olmo2 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             cur = build_norm(cur,
                     model.layers[il].attn_post_norm, NULL,
                     LLM_NORM_RMS, il);
             cb(cur, "attn_post_norm", il);
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -9574,6 +9853,8 @@ struct llm_build_olmoe : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -9628,9 +9909,7 @@ struct llm_build_olmoe : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -9700,6 +9979,8 @@ struct llm_build_openelm : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             const int64_t n_head = hparams.n_head(il);
             const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -9761,11 +10042,9 @@ struct llm_build_openelm : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-                cur      = ggml_get_rows(ctx0,      cur, inp_out_ids);
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             }
 
             ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
@@ -9831,6 +10110,8 @@ struct llm_build_gptneox : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -9875,9 +10156,7 @@ struct llm_build_gptneox : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -9979,6 +10258,8 @@ struct llm_build_arctic : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -10025,9 +10306,7 @@ struct llm_build_arctic : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -10119,6 +10398,8 @@ struct llm_build_deepseek : public llm_graph_context {
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -10180,14 +10461,11 @@ struct llm_build_deepseek : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -10295,6 +10573,8 @@ struct llm_build_deepseek2 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -10444,9 +10724,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
                 }
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -10542,6 +10820,8 @@ struct llm_build_bitnet : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -10624,9 +10904,7 @@ struct llm_build_bitnet : public llm_graph_context {
                 cb(cur, "attn_o_out", il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -10701,6 +10979,8 @@ struct llm_build_t5_enc : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_no_cache();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -10734,9 +11014,7 @@ struct llm_build_t5_enc : public llm_graph_context {
                 cb(cur, "kqv_out", il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -10807,6 +11085,8 @@ struct llm_build_t5_dec : public llm_graph_context {
         auto * inp_attn_self = build_attn_inp_kv_unified();
         auto * inp_attn_cross = build_attn_inp_cross();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -10898,11 +11178,8 @@ struct llm_build_t5_dec : public llm_graph_context {
                 //cb(cur, "kqv_out", il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
             }
 
@@ -10972,6 +11249,8 @@ struct llm_build_jais : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -11004,9 +11283,7 @@ struct llm_build_jais : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -11070,6 +11347,8 @@ struct llm_build_chatglm : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -11136,9 +11415,7 @@ struct llm_build_chatglm : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -11203,6 +11480,8 @@ struct llm_build_glm4 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -11269,9 +11548,7 @@ struct llm_build_glm4 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -11354,6 +11631,8 @@ struct llm_build_nemotron : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -11413,9 +11692,7 @@ struct llm_build_nemotron : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -11483,6 +11760,8 @@ struct llm_build_exaone : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -11544,9 +11823,7 @@ struct llm_build_exaone : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -11633,14 +11910,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
     }
 
     ggml_tensor * build_rwkv6_time_mix(
+            llm_graph_input_rs * inp,
             ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * x_prev,
-            ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+        const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -11650,7 +11926,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
         const auto n_head = n_embd / head_size;
         const auto n_head_kv = hparams.n_head_kv(il);
 
-        const auto kv_head = kv_self->head;
+        const auto kv_head = mctx_cur->get_head();
 
         const auto & layer = model.layers[il];
 
@@ -11761,9 +12037,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
         }
 
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_self->v_l[il], state_copy, state_mask,
-                hparams.n_embd_v_s(), n_seqs);
+        ggml_tensor * wkv_state = build_rs(
+                inp, gf, mctx_cur->get_s_l(il),
+                hparams.n_embd_s(), n_seqs);
 
         ggml_tensor * wkv_output;
         if (is_qrwkv) {
@@ -11781,9 +12057,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
                     wkv_state,
                     ggml_view_1d(
                         ctx0,
-                        kv_self->v_l[il],
-                        hparams.n_embd_v_s() * n_seqs,
-                        hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
+                        mctx_cur->get_s_l(il),
+                        hparams.n_embd_s() * n_seqs,
+                        hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
                     )
                 )
             );
@@ -11817,20 +12093,19 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         inpL = build_inp_embd(model.tok_embd);
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
-        ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
+        auto * rs_inp = build_rs_inp();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
         const auto n_seqs = ubatch.n_seqs;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];
             inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
-            ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
-            );
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
 
             ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
             ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -11845,7 +12120,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
                 1
             );
 
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
 
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -11867,13 +12142,16 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
             );
             ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
-                ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
-                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+            ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
+            x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+            cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+                ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
+                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             }
 
             cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
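The rwkv6 hunk above inserts ggml_reshape_2d calls before the last-layer gather: activations travel through the loop as {n_embd, n_seq_tokens, n_seqs}, and flattening back to {n_embd, n_tokens} makes row i mean token i again, so the same get_rows-by-output-id trick used in the attention builders applies. A small sketch of the idea with a plain contiguous buffer (the reshape is index arithmetic, not a copy; sizes here are invented):

#include <cstdio>

int main() {
    const int n_embd = 2, n_seq_tokens = 3, n_seqs = 2;
    const int n_tokens = n_seq_tokens * n_seqs;

    // contiguous buffer viewed as [n_seqs][n_seq_tokens][n_embd]
    float buf[n_seqs * n_seq_tokens * n_embd];
    for (int i = 0; i < n_tokens * n_embd; ++i) buf[i] = (float) i;

    // "reshaping" to [n_tokens][n_embd] changes only the indexing,
    // which is what ggml_reshape_2d does on a contiguous tensor
    const int out_id = n_tokens - 1; // keep only the last token's row
    printf("row %d = (%f, %f)\n", out_id,
           (double) buf[out_id * n_embd + 0], (double) buf[out_id * n_embd + 1]);
    return 0;
}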
@@ -11908,27 +12186,26 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
 // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
 struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
     llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
-        GGML_ASSERT(n_embd == hparams.n_embd_k_s());
+        GGML_ASSERT(n_embd == hparams.n_embd_r());
 
         ggml_tensor * cur;
         ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
-        ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
+        auto * rs_inp = build_rs_inp();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
         const auto n_seqs = ubatch.n_seqs;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];
             inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
-            ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
-            );
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
 
             ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
             cb(att_norm, "attn_norm", il);
@@ -11940,7 +12217,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
                 1
             );
 
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
 
             token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
             ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -11948,11 +12225,12 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+            ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
             }
 
             // feed-forward network
@@ -12028,15 +12306,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
     }
 
     ggml_tensor * build_rwkv7_time_mix(
+            llm_graph_input_rs * inp,
             ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * x_prev,
-            ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+        const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -12045,7 +12322,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
         const auto head_count = n_embd / head_size;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
 
-        const auto kv_head = kv_self->head;
+        const auto kv_head = mctx_cur->get_head();
 
         const auto & layer = model.layers[il];
 
@@ -12115,9 +12392,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
         v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
         a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
 
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_self->v_l[il], state_copy, state_mask,
-                hparams.n_embd_v_s(), n_seqs);
+        ggml_tensor * wkv_state = build_rs(
+                inp, gf, mctx_cur->get_s_l(il),
+                hparams.n_embd_s(), n_seqs);
 
         ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
         cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
@@ -12130,9 +12407,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
                     wkv_state,
                     ggml_view_1d(
                         ctx0,
-                        kv_self->v_l[il],
-                        hparams.n_embd_v_s() * n_seqs,
-                        hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
+                        mctx_cur->get_s_l(il),
+                        hparams.n_embd_s() * n_seqs,
+                        hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
                     )
                 )
             );
@@ -12173,20 +12450,19 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
        inpL = build_inp_embd(model.tok_embd);
        inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
-        ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
+        auto * rs_inp = build_rs_inp();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
         const auto n_seqs = ubatch.n_seqs;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];
             inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
-            ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
-            );
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
 
             ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
             ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -12201,7 +12477,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
                 1
             );
 
-            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+            cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
 
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -12223,12 +12499,14 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
             );
             ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
-                ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
-                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
+            ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+            ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
+            x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+                ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
+                x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
             }
 
             cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
@@ -12259,7 +12537,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
 
 struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
-        GGML_ASSERT(n_embd == hparams.n_embd_k_s());
+        GGML_ASSERT(n_embd == hparams.n_embd_r());
 
         ggml_tensor * cur;
         ggml_tensor * inpL;
@@ -12267,20 +12545,19 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
+        auto * rs_inp = build_rs_inp();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
         const auto n_seqs = ubatch.n_seqs;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];
             inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
-            ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
-            );
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
 
             ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
             cb(att_norm, "attn_norm", il);
@@ -12292,7 +12569,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
                 1
             );
 
-            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+            cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
 
             token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
             ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12300,11 +12577,12 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+            ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
             }
 
             // feed-forward network
@@ -12373,6 +12651,9 @@ struct llm_build_granite : public llm_graph_context {
         auto * inp_attn = build_attn_inp_kv_unified();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -12435,9 +12716,7 @@ struct llm_build_granite : public llm_graph_context {
                 cb(cur, "attn_out", il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
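The hunks above show the change this diff applies to every graph builder in llama-model.cpp: `build_inp_out_ids()` is hoisted out of the per-layer loop and called once, and the last-layer row filtering is guarded on the tensor being non-null instead of building the input on the spot. A minimal sketch of the resulting control flow, with the loop body elided (a shape illustration only, not the package's verbatim source):

```cpp
// Built once, before the layer loop; presumably null when no row
// selection is needed, hence the added null check below.
ggml_tensor * inp_out_ids = build_inp_out_ids();

for (int il = 0; il < n_layer; ++il) {
    // ... attention / FFN graph construction for layer il ...

    if (il == n_layer - 1 && inp_out_ids) {
        // keep only the rows whose outputs the caller requested
        cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
        inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
    }
}
```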
@@ -12556,6 +12835,8 @@ struct llm_build_chameleon : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -12632,21 +12913,19 @@ struct llm_build_chameleon : public llm_graph_context {
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, nullptr,
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-
-                if (hparams.swin_norm) {
-                    cur = build_norm(cur,
-                            model.layers[il].attn_norm, NULL,
-                            LLM_NORM_RMS, il);
-                }
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            if (hparams.swin_norm) {
+                cur = build_norm(cur,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, il);
+            }
+
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -12887,6 +13166,8 @@ struct llm_build_plm : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -12990,9 +13271,7 @@ struct llm_build_plm : public llm_graph_context {
                         q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -13052,6 +13331,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -13113,9 +13394,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -13184,69 +13463,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };
 
+struct llm_build_dots1 : public llm_graph_context {
+    llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                ggml_tensor * moe_out =
+                    build_moe_ffn(cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            model.layers[il].ffn_exp_probs_b,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
+                            true, hparams.expert_weights_scale,
+                            (llama_expert_gating_func_type) hparams.expert_gating_func,
+                            il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_arcee : public llm_graph_context {
+    llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // ARCEE uses relu^2 instead of silu
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    NULL, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
     switch (arch) {
+        // Models that need specific instantiation should be handled in the
+        // switch statement
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 res = nullptr;
             } break;
-        case LLM_ARCH_MAMBA:
-        case LLM_ARCH_RWKV6:
-        case LLM_ARCH_RWKV6QWEN2:
-        case LLM_ARCH_RWKV7:
-        case LLM_ARCH_ARWKV7:
-            {
-                res = new llama_kv_cache_recurrent(
-                        *this,
-                        GGML_TYPE_F32,
-                        GGML_TYPE_F32,
-                        cparams.offload_kqv,
-                        std::max((uint32_t) 1, cparams.n_seq_max),
-                        cparams.n_seq_max);
-            } break;
+        // Models that need standard caching should rely on recurrent/hybrid
+        // checks
         default:
             {
-                const auto padding = llama_kv_cache_unified::get_padding(cparams);
-
-                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
-
-                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
-                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-                    GGML_ASSERT(hparams.is_swa_any());
-
-                    res = new llama_kv_cache_unified_iswa(
-                            *this,
-                            params.type_k,
-                            params.type_v,
-                            !cparams.flash_attn,
-                            cparams.offload_kqv,
-                            params.swa_full,
-                            cparams.n_ctx,
-                            cparams.n_seq_max,
-                            cparams.n_batch,
-                            padding);
-                } else {
-                    GGML_ASSERT(!hparams.is_swa_any());
-
-                    res = new llama_kv_cache_unified(
+                if (llm_arch_is_recurrent(arch)) {
+                    res = new llama_memory_recurrent(
                         *this,
                         nullptr,
-                        params.type_k,
-                        params.type_v,
-                        !cparams.flash_attn,
+                        GGML_TYPE_F32,
+                        GGML_TYPE_F32,
                         cparams.offload_kqv,
-                        cparams.n_ctx,
-                        cparams.n_seq_max,
-                        padding,
-                        hparams.n_swa,
-                        hparams.swa_type);
+                        std::max((uint32_t) 1, cparams.n_seq_max),
+                        cparams.n_seq_max);
+                } else if (llm_arch_is_hybrid(arch)) {
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+
+                    res = new llama_memory_hybrid(
+                        /* model             */ *this,
+                        /* attn_type_k       */ params.type_k,
+                        /* attn_type_v       */ params.type_v,
+                        /* attn_v_trans      */ !cparams.flash_attn,
+                        /* attn_kv_size      */ cparams.n_ctx,
+                        /* attn_n_pad        */ padding,
+                        /* attn_n_swa        */ hparams.n_swa,
+                        /* attn_swa_type     */ hparams.swa_type,
+                        /* recurrent_type_k  */ GGML_TYPE_F32,
+                        /* recurrent_type_v  */ GGML_TYPE_F32,
+                        /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                        /* n_seq_max         */ cparams.n_seq_max,
+                        /* offload           */ cparams.offload_kqv);
+                } else {
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+
+                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                        GGML_ASSERT(hparams.is_swa_any());
+
+                        res = new llama_kv_cache_unified_iswa(
+                                *this,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                params.swa_full,
+                                cparams.n_ctx,
+                                cparams.n_seq_max,
+                                cparams.n_ubatch,
+                                padding);
+                    } else {
+                        GGML_ASSERT(!hparams.is_swa_any());
+
+                        res = new llama_kv_cache_unified(
+                            *this,
+                            nullptr,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            padding,
+                            hparams.n_swa,
+                            hparams.swa_type);
+                    }
+                }
             }
     }
 }
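The per-architecture recurrent cases removed above are replaced by the `llm_arch_is_recurrent()` / `llm_arch_is_hybrid()` predicates; `llama_model_is_recurrent()` further down in this diff is likewise reduced to a call to the former. A hedged sketch of what such a predicate plausibly looks like, with the case list inferred from the cases deleted here (the body is an assumption, not the package's verbatim source):

```cpp
// Assumed shape of the arch-level predicate; llm_arch is llama.cpp's
// internal architecture enum, and the cases mirror those removed above.
static bool llm_arch_is_recurrent(llm_arch arch) {
    switch (arch) {
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
            return true;
        default:
            return false;
    }
}
```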
@@ -13262,7 +13847,6 @@ llm_graph_result_ptr llama_model::build_graph(
 
     switch (arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_MINICPM:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13301,6 +13885,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;
+        case LLM_ARCH_NEO_BERT:
+            {
+                llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 llm = std::make_unique<llm_build_bloom>(*this, params, gf);
@@ -13503,6 +14091,7 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MINICPM:
            {
                 llm = std::make_unique<llm_build_granite>(*this, params, gf);
             } break;
@@ -13522,6 +14111,14 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
             } break;
+        case LLM_ARCH_DOTS1:
+            {
+                llm = std::make_unique<llm_build_dots1>(*this, params, gf);
+            } break;
+        case LLM_ARCH_ARCEE:
+            {
+                llm = std::make_unique<llm_build_arcee>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -13593,6 +14190,22 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
     return model->hparams.n_head_kv();
 }
 
+int32_t llama_model_n_swa(const llama_model * model) {
+    return model->hparams.n_swa;
+}
+
+uint32_t llama_model_n_cls_out(const struct llama_model * model) {
+    return model->hparams.n_cls_out;
+}
+
+const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
+    if (i < model->classifier_labels.size()) {
+        return model->classifier_labels[i].c_str();
+    }
+
+    return nullptr;
+}
+
 // deprecated
 int32_t llama_n_ctx_train(const llama_model * model) {
     return llama_model_n_ctx_train(model);
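The hunk above adds three public accessors: sliding-window size (`llama_model_n_swa`), classifier output count (`llama_model_n_cls_out`), and classifier label lookup (`llama_model_cls_label`). A short usage sketch; the `print_cls_labels` helper and the way `model` is obtained are illustrative, not part of the package:

```cpp
#include <cstdio>
#include "llama.h"

// Illustrative helper: enumerate the classifier labels of a loaded model.
static void print_cls_labels(const struct llama_model * model) {
    const uint32_t n = llama_model_n_cls_out(model);
    for (uint32_t i = 0; i < n; ++i) {
        // returns nullptr when i is out of range
        const char * label = llama_model_cls_label(model, i);
        printf("class %u: %s\n", i, label ? label : "(unnamed)");
    }
    printf("sliding window size: %d\n", llama_model_n_swa(model));
}
```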
@@ -13655,6 +14268,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
+        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_ARCEE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -13688,6 +14303,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
+        case LLM_ARCH_DOTS1:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
@@ -13753,7 +14369,7 @@ uint64_t llama_model_size(const llama_model * model) {
 }
 
 const char * llama_model_chat_template(const llama_model * model, const char * name) {
-    const auto key = name ? LLM_KV(model->arch, name)(
+    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
                           : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
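The one-line change above completes the key used to look up a named chat template; the fallback branch for the default template is unchanged. A hedged usage sketch (the template name "rag" is illustrative only; the function returns nullptr when the GGUF metadata carries no matching template):

```cpp
// Default chat template, or nullptr if the model ships none:
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);

// Named template lookup; availability depends on the model's metadata:
const char * tmpl_rag = llama_model_chat_template(model, "rag");
```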
|
@@ -13795,14 +14411,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
|
|
|
13795
14411
|
}
|
|
13796
14412
|
|
|
13797
14413
|
bool llama_model_is_recurrent(const llama_model * model) {
|
|
13798
|
-
|
|
13799
|
-
case LLM_ARCH_MAMBA: return true;
|
|
13800
|
-
case LLM_ARCH_RWKV6: return true;
|
|
13801
|
-
case LLM_ARCH_RWKV6QWEN2: return true;
|
|
13802
|
-
case LLM_ARCH_RWKV7: return true;
|
|
13803
|
-
case LLM_ARCH_ARWKV7: return true;
|
|
13804
|
-
default: return false;
|
|
13805
|
-
}
|
|
14414
|
+
return llm_arch_is_recurrent(model->arch);
|
|
13806
14415
|
}
|
|
13807
14416
|
|
|
13808
14417
|
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|