cui-llama.rn 1.0.7 → 1.0.10
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- package/android/src/main/jni.cpp +1 -1
- package/cpp/common.cpp +67 -34
- package/cpp/common.h +23 -8
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +55 -22
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +40 -15
- package/cpp/ggml.h +10 -6
- package/cpp/grammar-parser.cpp +3 -0
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +14 -18
- package/cpp/llama-vocab.h +4 -2
- package/cpp/llama.cpp +466 -280
- package/cpp/llama.h +10 -11
- package/cpp/rn-llama.hpp +23 -10
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -132,20 +132,6 @@ static std::string trim(const std::string & str) {
     return str.substr(start, end - start);
 }
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 static bool is_float_close(float a, float b, float abs_tol) {
     // Check for non-negative tolerance
     if (abs_tol < 0.0) {
@@ -233,6 +219,7 @@ enum llm_arch {
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
+    LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
     LLM_ARCH_UNKNOWN,
 };
@@ -277,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
+    { LLM_ARCH_T5ENCODER, "t5encoder" },
    { LLM_ARCH_JAIS, "jais" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -373,6 +361,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,
 
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
@@ -470,6 +459,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
     { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
     { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+    { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
 
     { LLM_KV_ADAPTER_TYPE, "adapter.type" },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -1284,6 +1274,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_T5ENCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+            { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
+            { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
+            { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
+            { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
+            { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
+            { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
+            { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
+            { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_JAIS,
         {
@@ -3578,13 +3586,8 @@ namespace GGUFMeta {
 
 using llama_buf_map = std::unordered_map<uint32_t, lm_ggml_backend_buffer_t>;
 
-    //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
-    //    return 32768;
-    //}
-
-    return 8192;
+static size_t llama_model_max_nodes(const llama_model & model) {
+    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
 }
 
 struct llama_model_loader {
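The hunk above replaces the fixed 8192-node compute-graph budget with one that scales with the model's tensor count, so checkpoints with very many layers no longer overflow the graph. A minimal standalone C++ sketch of the same sizing rule follows; the struct and function names here are illustrative stand-ins, not the library's API.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for llama_model: only the tensor count matters here.
struct model_info {
    size_t n_tensors;
};

// Same rule as the patched llama_model_max_nodes(): never go below 8192 nodes,
// and allow roughly 5 graph nodes per model tensor for very large models.
static size_t max_graph_nodes(const model_info & m) {
    return std::max<size_t>(8192, m.n_tensors * 5);
}

int main() {
    std::printf("small model : %zu nodes\n", max_graph_nodes({ 300 }));   // floor applies
    std::printf("huge model  : %zu nodes\n", max_graph_nodes({ 4000 }));  // 20000 nodes
}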
@@ -4904,7 +4907,6 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PHI3:
             {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
@@ -4913,6 +4915,22 @@ static void llm_load_hparams(
                     case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    hparams.n_swa = 2047;
+                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-mini-128k-instruct
+                    hparams.n_swa = 262144;
+                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-medium-128k-instruct
+                    hparams.n_swa = 131072;
+                }
+                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (!found_swa && hparams.n_swa == 0) {
+                    throw std::runtime_error("invalid value for sliding_window");
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
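The Phi-3 hunk above makes the sliding-window size optional in the GGUF metadata: a per-variant default is chosen from the layer count and training context, explicit metadata still wins, and a model that provides neither is rejected. A simplified standalone sketch of that selection logic follows; the struct and function names are ours, only the numeric defaults come from the diff.

#include <cstdint>
#include <cstdio>
#include <optional>
#include <stdexcept>

struct phi3_hparams {
    uint32_t n_layer     = 0;
    uint32_t n_head_kv   = 0;
    uint32_t n_ctx_train = 0;
};

// Mirrors the fallback chain in the diff: pick a per-variant default first,
// let an explicit metadata value override it, and reject models where neither
// source yields a sliding-window size.
static uint32_t resolve_sliding_window(const phi3_hparams & hp, std::optional<uint32_t> metadata_swa) {
    uint32_t n_swa = 0;
    if ((hp.n_layer == 32 || hp.n_layer == 40) && hp.n_ctx_train == 4096) {
        n_swa = 2047;      // Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
    } else if (hp.n_layer == 32 && hp.n_head_kv == 32 && hp.n_ctx_train == 131072) {
        n_swa = 262144;    // Phi-3-mini-128k-instruct
    } else if (hp.n_layer == 40 && hp.n_ctx_train == 131072) {
        n_swa = 131072;    // Phi-3-medium-128k-instruct
    }
    if (metadata_swa) {
        n_swa = *metadata_swa;
    } else if (n_swa == 0) {
        throw std::runtime_error("invalid value for sliding_window");
    }
    return n_swa;
}

int main() {
    std::printf("phi-3-mini-4k default swa: %u\n", resolve_sliding_window({32, 32, 4096}, std::nullopt));
}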
@@ -5210,6 +5228,12 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+                model.type = e_model::MODEL_UNKNOWN;
+            } break;
         case LLM_ARCH_JAIS:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5454,6 +5478,12 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "codeshell") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                tokenizer_pre == "bloom") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
+                tokenizer_pre == "gpt3-finnish") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5597,6 +5627,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+            { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
         };
 
         for (const auto & it : special_token_types) {
@@ -5649,6 +5680,17 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // find EOM token: "<|eom_id|>"
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+        //       for now, we apply this workaround to find the EOM token based on its text
+        if (vocab.special_eom_id == -1) {
+            const auto & t = vocab.token_to_id.find("<|eom_id|>");
+            if (t != vocab.token_to_id.end()) {
+                vocab.special_eom_id = t->second;
+            }
+        }
     }
 
     // build special tokens cache
@@ -7432,6 +7474,42 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
                     }
                 } break;
+            case LLM_ARCH_T5ENCODER:
+                {
+                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                        lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm_enc  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd});
+                        layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+                        layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        layer.ffn_up_enc   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd, n_ff});
+                    }
+                } break;
             case LLM_ARCH_JAIS:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -13146,7 +13224,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct lm_ggml_cgraph *
+    struct lm_ggml_cgraph * build_t5_encoder() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -13161,303 +13239,323 @@ struct llm_build_context {
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
+        LM_GGML_ASSERT(lctx.is_encoding);
+        struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
 
-        for (int il = 0; il < n_layer; ++il) {
-            struct lm_ggml_tensor * inpSA = inpL;
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
 
-                model.layers[il].attn_norm_enc, NULL,
-                LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
 
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm_enc, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
 
+            // self-attention
+            {
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
+                cb(Qcur, "Qcur", il);
 
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
+                cb(Kcur, "Kcur", il);
 
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
+                cb(Vcur, "Vcur", il);
 
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
+                struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
 
-                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
-                cb(kq_b, "kq_b", il);
+                struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);
 
+                struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+                struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
+                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
+                cb(kq_b, "kq_b", il);
 
+                kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);
 
+                struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+                cb(v, "v", il);
 
+                struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+                cb(kqv, "kqv", il);
 
+                struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
 
+                cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
 
-            cb(cur, "kqv_out", il);
-        }
+                lm_ggml_build_forward_expand(gf, cur);
 
-            n_tokens = n_outputs;
-            cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
+                cb(cur, "kqv_out", il);
+            }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
 
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                model.layers[il].ffn_norm_enc, NULL,
-                LLM_NORM_RMS, cb, il);
-            cb(cur, "ffn_norm", il);
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
 
-                model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
-                cb, il);
-            cb(cur, "ffn_out", il);
-        }
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm_enc, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
 
+                // T5 uses relu, flan-T5 uses gelu-gated
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up_enc,   NULL, NULL,
+                        model.layers[il].ffn_gate_enc, NULL, NULL,
+                        model.layers[il].ffn_down_enc, NULL, NULL,
+                        NULL,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+                        cb, il);
                 cb(cur, "ffn_out", il);
+            }
 
-                cur = lm_ggml_add(ctx0, cur, layer_dir);
-            }
-            cb(cur, "l_out", il);
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
 
+            lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = lm_ggml_add(ctx0, cur, layer_dir);
             }
+            cb(cur, "l_out", il);
 
+            // input for next layer
+            inpL = cur;
+        }
 
-            LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-    } else {
-        LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+        cur = inpL;
+        cb(cur, "result_embd", -1);
 
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm_enc, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
 
-        struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+        lm_ggml_build_forward_expand(gf, cur);
 
+        return gf;
+    }
 
-                model.layers[il].attn_norm, NULL,
-                LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
+    struct lm_ggml_cgraph * build_t5_decoder() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-                struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
 
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
-                        n_embd_head_k, n_kv, n_head_kv,
-                        lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                        0);
-                cb(k, "k", il);
+        LM_GGML_ASSERT(!lctx.is_encoding);
+        LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
 
-                        n_kv, n_embd_head_v, n_head_kv,
-                        lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
-                        lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
-                        0);
-                cb(v, "v", il);
+        struct lm_ggml_tensor * embd_enc       = llm_build_inp_embd_enc();
+        struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
 
+        struct lm_ggml_tensor * KQ_mask_dec   = build_inp_KQ_mask();
+        struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
 
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
 
+            // self-attention
+            {
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
 
-                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
-                cb(kq_b, "kq_b", il);
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
 
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
 
-                cb(kqv, "kqv", il);
+                llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
 
+                struct lm_ggml_tensor * k =
+                    lm_ggml_view_3d(ctx0, kv_self.k_l[il],
+                            n_embd_head_k, n_kv, n_head_kv,
+                            lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                            lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                            0);
+                cb(k, "k", il);
 
+                struct lm_ggml_tensor * v =
+                    lm_ggml_view_3d(ctx0, kv_self.v_l[il],
+                            n_kv, n_embd_head_v, n_head_kv,
+                            lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
+                            lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+                            0);
+                cb(v, "v", il);
 
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            cb(cur, "kqv_out", il);
-        }
+                struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
 
-        cb(
+                struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);
 
-            struct lm_ggml_tensor *
+                struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+                struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
+                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
+                cb(kq_b, "kq_b", il);
 
-                model.layers[il].attn_norm_cross, NULL,
-                LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm_cross", il);
+                kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);
 
-                struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
-                cb(Qcur, "Qcur", il);
+                struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
+                cb(kqv, "kqv", il);
 
+                struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
 
+                cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
 
-                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+                lm_ggml_build_forward_expand(gf, cur);
 
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                cb(cur, "kqv_out", il);
+            }
 
+            cur = lm_ggml_add(ctx0, cur, inpSA);
+            cb(cur, "cross_inp", il);
 
-            cb(kq, "kq_soft_max_ext", il);
+            struct lm_ggml_tensor * inpCA = cur;
 
+            // norm
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_norm_cross, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm_cross", il);
 
+            // cross-attention
+            {
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
+                cb(Qcur, "Qcur", il);
 
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
+                cb(Kcur, "Kcur", il);
 
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
+                cb(Vcur, "Vcur", il);
 
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
 
-            }
+                struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
 
-                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
-                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
-                inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
-            }
+                struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);
 
-            cb(
+                kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);
 
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, cb, il);
-            cb(cur, "ffn_norm", il);
+                struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+                cb(v, "v", il);
 
-                model.layers[il].ffn_up, NULL, NULL,
-                model.layers[il].ffn_gate, NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
-                cb, il);
-            cb(cur, "ffn_out", il);
-        }
+                struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+                cb(kqv, "kqv", il);
 
-            cb(
+                struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
 
-                cur = lm_ggml_add(ctx0, cur, layer_dir);
-            }
-            cb(cur, "l_out", il);
+                cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
 
+                lm_ggml_build_forward_expand(gf, cur);
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
+                cb(cur, "kqv_out", il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
+            }
 
-            LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA);
+            cb(ffn_inp, "ffn_inp", il);
 
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                // T5 uses relu, flan-T5 uses gelu-gated
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+                        cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = lm_ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
         }
 
+        cur = inpL;
+        cb(cur, "result_embd", -1);
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
         lm_ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -13909,7 +14007,15 @@ static struct lm_ggml_cgraph * llama_build_graph(
             } break;
         case LLM_ARCH_T5:
             {
+                if (lctx.is_encoding) {
+                    result = llm.build_t5_encoder();
+                } else {
+                    result = llm.build_t5_decoder();
+                }
+            } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                result = llm.build_t5_encoder();
             } break;
         case LLM_ARCH_JAIS:
             {
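With the encoder split out, graph construction for the T5 family now dispatches on whether the context is currently encoding, and the encoder-only architecture always takes the encoder path. A minimal standalone C++ sketch of that dispatch shape follows; the enums and function name are illustrative, not the library's API.

#include <cstdio>
#include <stdexcept>

enum class arch  { t5, t5_encoder, other };
enum class graph { t5_encoder, t5_decoder };

// Mirrors the dispatch added in llama_build_graph(): T5 picks the encoder or the
// decoder graph depending on whether the current pass is an encode, while the
// encoder-only T5ENCODER architecture always builds the encoder graph.
static graph pick_graph(arch a, bool is_encoding) {
    switch (a) {
        case arch::t5:         return is_encoding ? graph::t5_encoder : graph::t5_decoder;
        case arch::t5_encoder: return graph::t5_encoder;
        default:               throw std::runtime_error("unhandled arch in this sketch");
    }
}

int main() {
    const bool decoder = pick_graph(arch::t5, /*is_encoding=*/false) == graph::t5_decoder;
    std::printf("t5 decode pass -> %s graph\n", decoder ? "decoder" : "encoder");
}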
@@ -14357,7 +14463,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
 
     // TODO: use a per-batch flag for logits presence instead
     const bool has_logits = !cparams.embeddings;
-    const bool has_embd   =
+    const bool has_embd   = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ? n_embd*n_outputs_max  : 0;
@@ -14628,12 +14734,15 @@ static int llama_decode_internal(
         res  = nullptr;
         embd = nullptr;
     } else if (cparams.embeddings) {
-        res
-        embd =
+        res  = nullptr; // do not extract logits for embedding case
+        embd = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+                embd = gf->nodes[i];
+                break;
+            }
         }
-        LM_GGML_ASSERT(
+        LM_GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
     } else {
         embd = nullptr; // do not extract embeddings when not needed
         LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
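In the embeddings-only decode path the change above stops assuming a fixed graph layout and instead scans the compute graph backwards for the tensor named "result_embd_pooled". A standalone C++ sketch of that reverse name search follows; the node and graph types here are simple stand-ins, not ggml's.

#include <cstring>
#include <string>
#include <vector>

// Stand-in for a compute graph: an ordered list of named nodes.
struct node   { std::string name; };
struct cgraph { std::vector<node> nodes; };

// Mirrors the loop in llama_decode_internal(): walk from the last node backwards
// and return the first node whose name matches, or nullptr if it is absent.
static const node * find_named_node(const cgraph & gf, const char * name) {
    for (int i = (int) gf.nodes.size() - 1; i >= 0; --i) {
        if (std::strcmp(gf.nodes[i].name.c_str(), name) == 0) {
            return &gf.nodes[i];
        }
    }
    return nullptr;
}

int main() {
    cgraph gf { { {"inp_embd"}, {"result_embd_pooled"}, {"result_output"} } };
    return find_named_node(gf, "result_embd_pooled") != nullptr ? 0 : 1;
}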
@@ -14840,9 +14949,24 @@ static int llama_encode_internal(
     lm_ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
 
     // the output embeddings after the final encoder normalization
-    struct lm_ggml_tensor * embd =
+    struct lm_ggml_tensor * embd = nullptr;
 
+    // there are two cases here
+    if (llama_model_has_decoder(&lctx.model)) {
+        // first case is an encoder-decoder T5 model where embeddings are passed to decoder
+        embd = gf->nodes[gf->n_nodes - 1];
+        LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
+    } else {
+        // second case is an encoder-only T5 model
+        if (cparams.embeddings) {
+            // only output embeddings if required
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
+            }
+            LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+        }
+    }
 
     lm_ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
@@ -14855,20 +14979,54 @@ static int llama_encode_internal(
         lm_ggml_backend_t backend_embd = lm_ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
         LM_GGML_ASSERT(backend_embd != nullptr);
 
+            if (llama_model_has_decoder(&lctx.model)) {
+                lctx.embd_enc.resize(n_tokens*n_embd);
+                float * embd_out = lctx.embd_enc.data();
 
-            float * embd_out = lctx.embd_enc.data();
+                lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
 
+                // remember the sequence ids used during the encoding - needed for cross attention later
+                lctx.seq_ids_enc.resize(n_tokens);
+                for (uint32_t i = 0; i < n_tokens; i++) {
+                    for (int s = 0; s < batch.n_seq_id[i]; s++) {
+                        llama_seq_id seq_id = batch.seq_id[i][s];
+                        lctx.seq_ids_enc[i].insert(seq_id);
+                    }
+                }
+            } else {
+                LM_GGML_ASSERT(lctx.embd != nullptr);
 
+                switch (cparams.pooling_type) {
+                    case LLAMA_POOLING_TYPE_NONE:
+                        {
+                            // extract token embeddings
+                            LM_GGML_ASSERT(lctx.embd != nullptr);
+                            float * embd_out = lctx.embd;
+
+                            LM_GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+                            lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+                        } break;
+                    case LLAMA_POOLING_TYPE_MEAN:
+                    case LLAMA_POOLING_TYPE_CLS:
+                    case LLAMA_POOLING_TYPE_LAST:
+                        {
+                            // extract sequence embeddings
+                            auto & embd_seq_out = lctx.embd_seq;
+                            embd_seq_out.clear();
+
+                            for (uint32_t i = 0; i < n_tokens; i++) {
+                                const llama_seq_id seq_id = batch.seq_id[i][0];
+                                if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                    continue;
+                                }
+                                embd_seq_out[seq_id].resize(n_embd);
+                                lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                            }
+                        } break;
+                    case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                        {
+                            LM_GGML_ABORT("unknown pooling type");
+                        }
                 }
             }
         }
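llama_encode_internal now handles two outputs: with a decoder present the encoder result is stashed for cross-attention, otherwise embeddings are copied out according to the context's pooling type. A compact standalone C++ sketch of the pooling-side branching follows; plain containers stand in for the real backend tensor copies and all names are illustrative.

#include <cstddef>
#include <map>
#include <stdexcept>
#include <vector>

enum class pooling { none, mean, cls, last, unspecified };

// Mirrors the switch added to llama_encode_internal(): token-level extraction for
// pooling "none", one row per sequence id for mean/cls/last, error otherwise.
// `enc_out` stands in for the output tensor; for the pooled cases it is assumed to
// hold one row of n_embd floats per sequence id, matching the diff's offset math.
static void extract_embeddings(pooling type, const std::vector<float> & enc_out, int n_embd,
                               const std::vector<int> & seq_ids,            // one per token
                               std::vector<float> & token_out,
                               std::map<int, std::vector<float>> & seq_out) {
    switch (type) {
        case pooling::none:
            token_out = enc_out;                                            // all token embeddings
            break;
        case pooling::mean:
        case pooling::cls:
        case pooling::last:
            for (int id : seq_ids) {
                if (seq_out.count(id)) continue;                            // already extracted
                seq_out[id].assign(enc_out.begin() + (size_t) id * n_embd,
                                   enc_out.begin() + (size_t) (id + 1) * n_embd);
            }
            break;
        default:
            throw std::runtime_error("unknown pooling type");
    }
}

int main() {
    std::vector<float> enc_out = {1, 2, 3, 4};                              // 2 sequences x n_embd = 2
    std::vector<float> token_out;
    std::map<int, std::vector<float>> seq_out;
    extract_embeddings(pooling::mean, enc_out, 2, {0, 1}, token_out, seq_out);
    return (int) seq_out.size() == 2 ? 0 : 1;
}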
@@ -15304,7 +15462,7 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
         if (n_expert > 1) {
-            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
             // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
@@ -16578,6 +16736,8 @@ struct llama_context * llama_new_context_with_model(
 
     ctx->sampling.rng = std::mt19937(params.seed);
     ctx->logits_all   = params.logits_all;
+    // build worst-case graph for encoder if a model contains encoder
+    ctx->is_encoding  = llama_model_has_encoder(model);
 
     uint32_t kv_size = cparams.n_ctx;
     lm_ggml_type type_k = params.type_k;
@@ -16892,6 +17052,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
             return LLAMA_ROPE_TYPE_NONE;
 
@@ -17039,8 +17200,16 @@ struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const
 
 bool llama_model_has_encoder(const struct llama_model * model) {
     switch (model->arch) {
-        case LLM_ARCH_T5:
+        case LLM_ARCH_T5:        return true;
+        case LLM_ARCH_T5ENCODER: return true;
+        default:                 return false;
+    }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5ENCODER: return false;
+        default:                 return true;
     }
 }
 
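The capability queries above now distinguish encoder-decoder T5 from the new encoder-only T5ENCODER, which lets callers decide whether a prompt has to go through llama_encode() before llama_decode(). A hedged usage sketch against the public header follows; it compiles against llama.h but trims all error handling, and a real T5 pipeline would build a fresh decoder batch (starting from the decoder start token) rather than reusing the prompt batch.

#include "llama.h"

// Encoder-decoder models (T5) need llama_encode() before decoding, encoder-only
// models (T5ENCODER) stop after the encode because their embeddings are the output,
// and decoder-only models just decode.
static int32_t run_prompt(llama_context * ctx, const llama_model * model, llama_batch batch) {
    if (llama_model_has_encoder(model)) {
        const int32_t ret = llama_encode(ctx, batch);
        if (ret != 0 || !llama_model_has_decoder(model)) {
            return ret;  // encoder-only: embeddings are ready, nothing to decode
        }
        // note: a real pipeline would construct a decoder batch here
    }
    return llama_decode(ctx, batch);
}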
@@ -17343,6 +17512,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 // TODO: replace all non-fatal assertions with returned errors or exceptions
 struct llama_data_write {
     virtual void write(const void * src, size_t size) = 0;
+    virtual void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) = 0;
     virtual size_t get_size_written() = 0;
     virtual ~llama_data_write() = default;
 
@@ -17465,9 +17635,8 @@ struct llama_data_write {
         // Read each range of cells of k_size length each into tmp_buf and write out
         for (const auto & range : cell_ranges) {
             const size_t range_size = range.second - range.first;
-            write(tmp_buf.data(), tmp_buf.size());
+            const size_t buf_size = range_size * k_size_row;
+            write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
         }
     }
 
@@ -17486,9 +17655,8 @@ struct llama_data_write {
         // Read each range of cells of v_size length each into tmp_buf and write out
         for (const auto & range : cell_ranges) {
             const size_t range_size = range.second - range.first;
-            write(tmp_buf.data(), tmp_buf.size());
+            const size_t buf_size = range_size * v_size_row;
+            write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
         }
     }
 } else {
@@ -17514,9 +17682,8 @@ struct llama_data_write {
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
                 const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-                write(tmp_buf.data(), tmp_buf.size());
+                const size_t buf_size = range_size * v_size_el;
+                write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
             }
         }
     }
@@ -17875,12 +18042,14 @@ struct llama_data_write_dummy : llama_data_write {
 
     llama_data_write_dummy() {}
 
-    // TODO: avoid unnecessary calls to lm_ggml_backend_tensor_get in a dummy context
-
     void write(const void * /* src */, size_t size) override {
         size_written += size;
     }
 
+    void write_tensor_data(const struct lm_ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+        size_written += size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17903,6 +18072,16 @@ struct llama_data_write_buffer : llama_data_write {
         buf_size -= size;
     }
 
+    void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        lm_ggml_backend_tensor_get(tensor, ptr, offset, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17938,6 +18117,7 @@ struct llama_data_read_buffer : llama_data_read {
 struct llama_data_write_file : llama_data_write {
     llama_file * file;
     size_t size_written = 0;
+    std::vector<uint8_t> temp_buffer;
 
     llama_data_write_file(llama_file * f) : file(f) {}
 
@@ -17946,6 +18126,12 @@ struct llama_data_write_file : llama_data_write {
         size_written += size;
     }
 
+    void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+        temp_buffer.resize(size);
+        lm_ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+        write(temp_buffer.data(), temp_buffer.size());
+    }
+
     size_t get_size_written() override {
        return size_written;
     }
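The session-saving hunks above replace the old "read the tensor slice into tmp_buf, then write()" pattern with a write_tensor_data() virtual, so each writer decides for itself whether tensor bytes need to be staged at all: the file writer stages them in a reusable buffer, while the size-counting dummy never touches tensor memory. A standalone C++ sketch of the same pattern follows, with a plain byte vector standing in for a backend tensor; all types and names here are illustrative.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Stand-in for reading a tensor slice out of backend memory.
static void tensor_get(const std::vector<uint8_t> & tensor, void * dst, size_t offset, size_t size) {
    std::memcpy(dst, tensor.data() + offset, size);
}

struct data_write {
    virtual void write(const void * src, size_t size) = 0;
    // New hook from the diff: lets the writer pull tensor bytes itself.
    virtual void write_tensor_data(const std::vector<uint8_t> & tensor, size_t offset, size_t size) = 0;
    virtual ~data_write() = default;
};

// File-style writer: stages the slice in a reusable buffer, then writes it out.
struct data_write_file : data_write {
    std::FILE * f;
    std::vector<uint8_t> temp;
    explicit data_write_file(std::FILE * f) : f(f) {}
    void write(const void * src, size_t size) override { std::fwrite(src, 1, size, f); }
    void write_tensor_data(const std::vector<uint8_t> & t, size_t off, size_t size) override {
        temp.resize(size);
        tensor_get(t, temp.data(), off, size);
        write(temp.data(), temp.size());
    }
};

// Dummy writer: only measures the state size, never copies tensor data.
struct data_write_dummy : data_write {
    size_t size_written = 0;
    void write(const void *, size_t size) override { size_written += size; }
    void write_tensor_data(const std::vector<uint8_t> &, size_t, size_t size) override { size_written += size; }
};

int main() {
    data_write_dummy d;
    std::vector<uint8_t> tensor(256);
    d.write_tensor_data(tensor, 0, tensor.size());
    std::printf("would write %zu bytes\n", d.size_written);
}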
@@ -18530,11 +18716,11 @@ llama_token llama_token_pad(const struct llama_model * model) {
     return llama_token_pad_impl(model->vocab);
 }
 
+bool llama_add_bos_token(const struct llama_model * model) {
     return llama_add_bos_token_impl(model->vocab);
 }
 
+bool llama_add_eos_token(const struct llama_model * model) {
     return llama_add_eos_token_impl(model->vocab);
 }
 