cui-llama.rn 1.0.6 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/android/src/main/jni.cpp +2 -2
- package/cpp/common.cpp +68 -29
- package/cpp/common.h +23 -4
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +54 -21
- package/cpp/ggml-quants.c +8 -8
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +81 -12
- package/cpp/ggml.h +6 -4
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-vocab.cpp +10 -16
- package/cpp/llama-vocab.h +2 -0
- package/cpp/llama.cpp +434 -265
- package/cpp/llama.h +4 -1
- package/cpp/rn-llama.hpp +7 -6
- package/ios/RNLlamaContext.mm +1 -1
- package/jest/mock.js +3 -0
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -132,20 +132,6 @@ static std::string trim(const std::string & str) {
     return str.substr(start, end - start);
 }

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 static bool is_float_close(float a, float b, float abs_tol) {
     // Check for non-negative tolerance
     if (abs_tol < 0.0) {
@@ -233,6 +219,7 @@ enum llm_arch {
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
+    LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
     LLM_ARCH_UNKNOWN,
 };
@@ -277,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CHATGLM,   "chatglm"   },
     { LLM_ARCH_BITNET,    "bitnet"    },
     { LLM_ARCH_T5,        "t5"        },
+    { LLM_ARCH_T5ENCODER, "t5encoder" },
     { LLM_ARCH_JAIS,      "jais"      },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };
@@ -373,6 +361,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
@@ -470,6 +459,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
     { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
     { LLM_KV_TOKENIZER_EOT_ID,    "tokenizer.ggml.eot_token_id"    },
+    { LLM_KV_TOKENIZER_EOM_ID,    "tokenizer.ggml.eom_token_id"    },

     { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -1284,6 +1274,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_T5ENCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+            { LLM_TENSOR_ENC_ATTN_NORM,   "enc.blk.%d.attn_norm" },
+            { LLM_TENSOR_ENC_ATTN_Q,      "enc.blk.%d.attn_q" },
+            { LLM_TENSOR_ENC_ATTN_K,      "enc.blk.%d.attn_k" },
+            { LLM_TENSOR_ENC_ATTN_V,      "enc.blk.%d.attn_v" },
+            { LLM_TENSOR_ENC_ATTN_OUT,    "enc.blk.%d.attn_o" },
+            { LLM_TENSOR_ENC_ATTN_REL_B,  "enc.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_ENC_FFN_NORM,    "enc.blk.%d.ffn_norm" },
+            { LLM_TENSOR_ENC_FFN_GATE,    "enc.blk.%d.ffn_gate" },
+            { LLM_TENSOR_ENC_FFN_DOWN,    "enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_ENC_FFN_UP,      "enc.blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_JAIS,
         {
@@ -4980,6 +4988,7 @@ static void llm_load_hparams(
                 hparams.attn_soft_cap = true;

                 switch (hparams.n_layer) {
+                    case 26: model.type = e_model::MODEL_2B; break;
                     case 42: model.type = e_model::MODEL_9B; break;
                     case 46: model.type = e_model::MODEL_27B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -5209,6 +5218,12 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+                model.type = e_model::MODEL_UNKNOWN;
+            } break;
         case LLM_ARCH_JAIS:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5596,6 +5611,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
+            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
         };

         for (const auto & it : special_token_types) {
@@ -5648,6 +5664,17 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // find EOM token: "<|eom_id|>"
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+        //       for now, we apply this workaround to find the EOM token based on its text
+        if (vocab.special_eom_id == -1) {
+            const auto & t = vocab.token_to_id.find("<|eom_id|>");
+            if (t != vocab.token_to_id.end()) {
+                vocab.special_eom_id = t->second;
+            }
+        }
    }

     // build special tokens cache
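The vocab loader above now wires up an explicit end-of-message token (`<|eom_id|>`), preferring the new `tokenizer.ggml.eom_token_id` metadata key and falling back to a lookup by token text when the key is absent. A minimal, self-contained sketch of that fallback logic (hypothetical names, not llama.cpp's own types):

```cpp
// Illustration only: mirrors the metadata-first, text-second lookup used above.
#include <cstdint>
#include <string>
#include <unordered_map>

using token_id = int32_t; // stand-in for llama_token

token_id resolve_eom_id(const std::unordered_map<std::string, token_id> & token_to_id,
                        token_id id_from_metadata /* -1 when the KV entry is absent */) {
    if (id_from_metadata != -1) {
        return id_from_metadata;                    // metadata wins when present
    }
    const auto it = token_to_id.find("<|eom_id|>"); // workaround: match by token text
    return it != token_to_id.end() ? it->second : -1;
}
```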
@@ -7431,6 +7458,42 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
                     }
                 } break;
+            case LLM_ARCH_T5ENCODER:
+                {
+                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                        lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm_enc  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd});
+                        layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+                        layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up_enc   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
             case LLM_ARCH_JAIS:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -11747,6 +11810,7 @@ struct llm_build_context {

                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
                 switch (model.type) {
+                    case e_model::MODEL_2B:
                     case e_model::MODEL_9B:  Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
                     case e_model::MODEL_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
                     default: LM_GGML_ABORT("fatal error");
@@ -13144,7 +13208,7 @@ struct llm_build_context {
         return gf;
     }

-    struct lm_ggml_cgraph *
+    struct lm_ggml_cgraph * build_t5_encoder() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -13159,303 +13223,323 @@ struct llm_build_context {

         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

-
-
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
+        LM_GGML_ASSERT(lctx.is_encoding);
+        struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);

-
-
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);

-
-
-            model.layers[il].attn_norm_enc, NULL,
-            LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_norm", il);
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;

-
-
-
-
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm_enc, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);

-
-
+            // self-attention
+            {
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
+                cb(Qcur, "Qcur", il);

-
-
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
+                cb(Kcur, "Kcur", il);

-
-
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
+                cb(Vcur, "Vcur", il);

-
-
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

-
-
+                struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

-
-
-                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
-                cb(kq_b, "kq_b", il);
+                struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);

-
-
+                struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+                struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
+                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
+                cb(kq_b, "kq_b", il);

-
-
+                kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);

-
-
+                struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+                cb(v, "v", il);

-
-
+                struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+                cb(kqv, "kqv", il);

-
-
+                struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);

-
+                cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);

-
-                cb(cur, "kqv_out", il);
-            }
+                lm_ggml_build_forward_expand(gf, cur);

-
-
-
-                n_tokens = n_outputs;
-                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
+                cb(cur, "kqv_out", il);
+            }

-
-
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }

-
-
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm_enc, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "ffn_norm", il);
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);

-
-
-
-
-
-
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
-                    cb, il);
-            cb(cur, "ffn_out", il);
-        }
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm_enc, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);

-
+                // T5 uses relu, flan-T5 uses gelu-gated
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up_enc,   NULL, NULL,
+                        model.layers[il].ffn_gate_enc, NULL, NULL,
+                        model.layers[il].ffn_down_enc, NULL, NULL,
+                        NULL,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+                        cb, il);
                 cb(cur, "ffn_out", il);
+            }

-
-
-            cur = lm_ggml_add(ctx0, cur, layer_dir);
-        }
-        cb(cur, "l_out", il);
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);

-
-
+            lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = lm_ggml_add(ctx0, cur, layer_dir);
             }
+            cb(cur, "l_out", il);

-
-
+            // input for next layer
+            inpL = cur;
+        }

-
-
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-    } else {
-        LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+        cur = inpL;
+        cb(cur, "result_embd", -1);

-
-
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm_enc, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);

-
-        struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+        lm_ggml_build_forward_expand(gf, cur);

-
-
+        return gf;
+    }

-
-
-            model.layers[il].attn_norm, NULL,
-            LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_norm", il);
+    struct lm_ggml_cgraph * build_t5_decoder() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

-
-
-        struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-        cb(Qcur, "Qcur", il);
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;

-
-
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

-
-
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;

-
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

-
-
-                n_embd_head_k, n_kv, n_head_kv,
-                lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                0);
-        cb(k, "k", il);
+        LM_GGML_ASSERT(!lctx.is_encoding);
+        LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");

-
-
-                n_kv, n_embd_head_v, n_head_kv,
-                lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
-                lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
-                0);
-        cb(v, "v", il);
+        struct lm_ggml_tensor * embd_enc       = llm_build_inp_embd_enc();
+        struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);

-
+        struct lm_ggml_tensor * KQ_mask_dec   = build_inp_KQ_mask();
+        struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;

-
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);

-
-
+            // self-attention
+            {
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);

-
-
-                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
-                cb(kq_b, "kq_b", il);
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);

-
-
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);

-
-                cb(kqv, "kqv", il);
+                llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);

-
-
+                struct lm_ggml_tensor * k =
+                    lm_ggml_view_3d(ctx0, kv_self.k_l[il],
+                            n_embd_head_k, n_kv, n_head_kv,
+                            lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                            lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                            0);
+                cb(k, "k", il);

-
-
+                struct lm_ggml_tensor * v =
+                    lm_ggml_view_3d(ctx0, kv_self.v_l[il],
+                            n_kv, n_embd_head_v, n_head_kv,
+                            lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
+                            lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+                            0);
+                cb(v, "v", il);

-
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-
-                cb(cur, "kqv_out", il);
-            }
+                struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);

-
-        cb(
+                struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);

-        struct lm_ggml_tensor *
+                struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+                struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
+                struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
+                cb(kq_b, "kq_b", il);

-
-
-            model.layers[il].attn_norm_cross, NULL,
-            LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_norm_cross", il);
+                kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);

-
-
-        struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
-        cb(Qcur, "Qcur", il);
+                struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
+                cb(kqv, "kqv", il);

-
-
+                struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);

-
-
+                cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);

-
-        Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+                lm_ggml_build_forward_expand(gf, cur);

-
-
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                cb(cur, "kqv_out", il);
+            }

-
-
+            cur = lm_ggml_add(ctx0, cur, inpSA);
+            cb(cur, "cross_inp", il);

-
-        cb(kq, "kq_soft_max_ext", il);
+            struct lm_ggml_tensor * inpCA = cur;

-
-
+            // norm
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_norm_cross, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm_cross", il);

-
-
+            // cross-attention
+            {
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
+                cb(Qcur, "Qcur", il);

-
-
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
+                cb(Kcur, "Kcur", il);

-
-
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
+                cb(Vcur, "Vcur", il);

-
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);

-
-
-            }
+                struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

-
-
-            struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
-            n_tokens = n_outputs;
-            cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
-        }
+                struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);

-
-        cb(
+                kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);

-
-
-        cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, cb, il);
-        cb(cur, "ffn_norm", il);
+                struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+                cb(v, "v", il);

-
-
-                model.layers[il].ffn_up, NULL, NULL,
-                model.layers[il].ffn_gate, NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
-                cb, il);
-        cb(cur, "ffn_out", il);
-    }
+                struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+                cb(kqv, "kqv", il);

-
-        cb(
+                struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);

-
-
-            cur = lm_ggml_add(ctx0, cur, layer_dir);
-        }
-        cb(cur, "l_out", il);
+                cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);

-
-
+                lm_ggml_build_forward_expand(gf, cur);
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
+                cb(cur, "kqv_out", il);
             }

-
-
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
+            }

-
-
-
-
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                // T5 uses relu, flan-T5 uses gelu-gated
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+                        cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = lm_ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);

-        //
-
-        cb(cur, "result_output", -1);
+            // input for next layer
+            inpL = cur;
         }

+        cur = inpL;
+        cb(cur, "result_embd", -1);
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
         lm_ggml_build_forward_expand(gf, cur);

         return gf;
@@ -13907,7 +13991,15 @@ static struct lm_ggml_cgraph * llama_build_graph(
             } break;
         case LLM_ARCH_T5:
             {
-
+                if (lctx.is_encoding) {
+                    result = llm.build_t5_encoder();
+                } else {
+                    result = llm.build_t5_decoder();
+                }
+            } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                result = llm.build_t5_encoder();
             } break;
         case LLM_ARCH_JAIS:
             {
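With the graph builder now split into build_t5_encoder()/build_t5_decoder() and the LLM_ARCH_T5ENCODER dispatch above, an encoder-only model runs the encoder graph and never a decoder pass. A hedged usage sketch from the caller's side (assumes the llama.h API shipped with this package; error handling trimmed):

```cpp
#include "llama.h"

// Sketch: decide whether a loaded model needs llama_encode(), llama_decode(), or both.
static int32_t run_batch(llama_context * ctx, const llama_model * model, llama_batch batch) {
    if (llama_model_has_encoder(model)) {
        // T5 and T5ENCODER both run the encoder graph first
        const int32_t rc = llama_encode(ctx, batch);
        if (rc != 0) {
            return rc;
        }
    }
    if (llama_model_has_decoder(model)) {
        // encoder-decoder (T5) and decoder-only models continue here;
        // a pure T5ENCODER model reports no decoder and stops after encoding
        return llama_decode(ctx, batch);
    }
    return 0; // encoder-only: embeddings can now be read from the context
}
```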
@@ -14355,7 +14447,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {

     // TODO: use a per-batch flag for logits presence instead
     const bool has_logits = !cparams.embeddings;
-    const bool has_embd =
+    const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
@@ -14838,9 +14930,24 @@ static int llama_encode_internal(
     lm_ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

     // the output embeddings after the final encoder normalization
-    struct lm_ggml_tensor * embd =
+    struct lm_ggml_tensor * embd = nullptr;

-
+    // there are two cases here
+    if (llama_model_has_decoder(&lctx.model)) {
+        // first case is an encoder-decoder T5 model where embeddings are passed to decoder
+        embd = gf->nodes[gf->n_nodes - 1];
+        LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
+    } else {
+        // second case is an encoder-only T5 model
+        if (cparams.embeddings) {
+            // only output embeddings if required
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
+            }
+            LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+        }
+    }

     lm_ggml_backend_sched_alloc_graph(lctx.sched, gf);

@@ -14853,20 +14960,54 @@ static int llama_encode_internal(
         lm_ggml_backend_t backend_embd = lm_ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
         LM_GGML_ASSERT(backend_embd != nullptr);

-
-
+        if (llama_model_has_decoder(&lctx.model)) {
+            lctx.embd_enc.resize(n_tokens*n_embd);
+            float * embd_out = lctx.embd_enc.data();

-
-            float * embd_out = lctx.embd_enc.data();
+            lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));

-
+            // remember the sequence ids used during the encoding - needed for cross attention later
+            lctx.seq_ids_enc.resize(n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                for (int s = 0; s < batch.n_seq_id[i]; s++) {
+                    llama_seq_id seq_id = batch.seq_id[i][s];
+                    lctx.seq_ids_enc[i].insert(seq_id);
+                }
+            }
+        } else {
+            LM_GGML_ASSERT(lctx.embd != nullptr);

-
-
-
-
-
-
+            switch (cparams.pooling_type) {
+                case LLAMA_POOLING_TYPE_NONE:
+                    {
+                        // extract token embeddings
+                        LM_GGML_ASSERT(lctx.embd != nullptr);
+                        float * embd_out = lctx.embd;
+
+                        LM_GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+                        lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+                    } break;
+                case LLAMA_POOLING_TYPE_MEAN:
+                case LLAMA_POOLING_TYPE_CLS:
+                case LLAMA_POOLING_TYPE_LAST:
+                    {
+                        // extract sequence embeddings
+                        auto & embd_seq_out = lctx.embd_seq;
+                        embd_seq_out.clear();
+
+                        for (uint32_t i = 0; i < n_tokens; i++) {
+                            const llama_seq_id seq_id = batch.seq_id[i][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(n_embd);
+                            lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                    {
+                        LM_GGML_ABORT("unknown pooling type");
+                    }
             }
         }
     }
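The pooling-aware branch above stores either per-token embeddings (lctx.embd) or one pooled vector per sequence (lctx.embd_seq) after an encoder pass. A hedged sketch of how that surfaces through the public API (assumes the llama.h of this release):

```cpp
#include "llama.h"

// Sketch: read back encoder embeddings after a successful llama_encode() call.
static const float * read_embeddings(llama_context * ctx, llama_seq_id seq_id, int32_t i_token) {
    if (llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_NONE) {
        // token embeddings: one n_embd vector per output token
        return llama_get_embeddings_ith(ctx, i_token);
    }
    // MEAN / CLS / LAST pooling: one n_embd vector per sequence id
    return llama_get_embeddings_seq(ctx, seq_id);
}
```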
@@ -15302,7 +15443,7 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
         if (n_expert > 1) {
-            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
             // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
@@ -16576,6 +16717,8 @@ struct llama_context * llama_new_context_with_model(

     ctx->sampling.rng = std::mt19937(params.seed);
     ctx->logits_all   = params.logits_all;
+    // build worst-case graph for encoder if a model contains encoder
+    ctx->is_encoding  = llama_model_has_encoder(model);

     uint32_t kv_size = cparams.n_ctx;
     lm_ggml_type type_k = params.type_k;
@@ -16890,6 +17033,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
             return LLAMA_ROPE_TYPE_NONE;

@@ -17037,8 +17181,16 @@ struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const

 bool llama_model_has_encoder(const struct llama_model * model) {
     switch (model->arch) {
-        case LLM_ARCH_T5:
-
+        case LLM_ARCH_T5:        return true;
+        case LLM_ARCH_T5ENCODER: return true;
+        default:                 return false;
+    }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5ENCODER: return false;
+        default:                 return true;
     }
 }

@@ -17341,6 +17493,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 // TODO: replace all non-fatal assertions with returned errors or exceptions
 struct llama_data_write {
     virtual void write(const void * src, size_t size) = 0;
+    virtual void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) = 0;
     virtual size_t get_size_written() = 0;
     virtual ~llama_data_write() = default;

@@ -17463,9 +17616,8 @@ struct llama_data_write {
             // Read each range of cells of k_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-
-
-                write(tmp_buf.data(), tmp_buf.size());
+                const size_t buf_size = range_size * k_size_row;
+                write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
             }
         }

@@ -17484,9 +17636,8 @@ struct llama_data_write {
                 // Read each range of cells of v_size length each into tmp_buf and write out
                 for (const auto & range : cell_ranges) {
                     const size_t range_size = range.second - range.first;
-
-
-                    write(tmp_buf.data(), tmp_buf.size());
+                    const size_t buf_size = range_size * v_size_row;
+                    write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
                 }
             }
         } else {
@@ -17512,9 +17663,8 @@ struct llama_data_write {
                     for (const auto & range : cell_ranges) {
                         const size_t range_size = range.second - range.first;
                         const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-
-
-                        write(tmp_buf.data(), tmp_buf.size());
+                        const size_t buf_size = range_size * v_size_el;
+                        write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
                     }
                 }
             }
@@ -17873,12 +18023,14 @@ struct llama_data_write_dummy : llama_data_write {

     llama_data_write_dummy() {}

-    // TODO: avoid unnecessary calls to lm_ggml_backend_tensor_get in a dummy context
-
     void write(const void * /* src */, size_t size) override {
         size_written += size;
     }

+    void write_tensor_data(const struct lm_ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+        size_written += size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17901,6 +18053,16 @@ struct llama_data_write_buffer : llama_data_write {
         buf_size -= size;
     }

+    void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        lm_ggml_backend_tensor_get(tensor, ptr, offset, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17936,6 +18098,7 @@ struct llama_data_read_buffer : llama_data_read {
 struct llama_data_write_file : llama_data_write {
     llama_file * file;
     size_t size_written = 0;
+    std::vector<uint8_t> temp_buffer;

     llama_data_write_file(llama_file * f) : file(f) {}

@@ -17944,6 +18107,12 @@ struct llama_data_write_file : llama_data_write {
         size_written += size;
     }

+    void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+        temp_buffer.resize(size);
+        lm_ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+        write(temp_buffer.data(), temp_buffer.size());
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
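The new write_tensor_data() virtual lets each session writer pull KV-cache tensor bytes directly from the backend (or, for the dummy writer, merely count them) instead of the caller staging every range through a temporary buffer. A self-contained illustration of the pattern (hypothetical names, not llama.cpp code):

```cpp
#include <cstddef>

// Minimal sink interface mirroring the write()/write_tensor_data() split above.
struct data_sink {
    virtual ~data_sink() = default;
    virtual void write(const void * src, size_t size) = 0;
    virtual void write_tensor(size_t offset, size_t size) = 0; // backend handle elided
};

// Size-only pass: nothing is fetched from the backend, the total is just accumulated,
// which is what makes the dummy writer cheap when measuring state size.
struct size_counter : data_sink {
    size_t total = 0;
    void write(const void * /*src*/, size_t size) override { total += size; }
    void write_tensor(size_t /*offset*/, size_t size) override { total += size; }
};
```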