cui-llama.rn 1.0.6 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.cpp CHANGED
@@ -132,20 +132,6 @@ static std::string trim(const std::string & str) {
  return str.substr(start, end - start);
  }

- static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- std::string result;
- for (size_t pos = 0; ; pos += search.length()) {
- auto new_pos = s.find(search, pos);
- if (new_pos == std::string::npos) {
- result += s.substr(pos, s.size() - pos);
- break;
- }
- result += s.substr(pos, new_pos - pos) + replace;
- pos = new_pos;
- }
- s = std::move(result);
- }
-
  static bool is_float_close(float a, float b, float abs_tol) {
  // Check for non-negative tolerance
  if (abs_tol < 0.0) {
@@ -233,6 +219,7 @@ enum llm_arch {
  LLM_ARCH_CHATGLM,
  LLM_ARCH_BITNET,
  LLM_ARCH_T5,
+ LLM_ARCH_T5ENCODER,
  LLM_ARCH_JAIS,
  LLM_ARCH_UNKNOWN,
  };
@@ -277,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_CHATGLM, "chatglm" },
  { LLM_ARCH_BITNET, "bitnet" },
  { LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
  { LLM_ARCH_JAIS, "jais" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
@@ -373,6 +361,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_SUFFIX_ID,
  LLM_KV_TOKENIZER_MIDDLE_ID,
  LLM_KV_TOKENIZER_EOT_ID,
+ LLM_KV_TOKENIZER_EOM_ID,

  LLM_KV_ADAPTER_TYPE,
  LLM_KV_ADAPTER_LORA_ALPHA,
@@ -470,6 +459,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
  { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
  { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+ { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },

  { LLM_KV_ADAPTER_TYPE, "adapter.type" },
  { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -1284,6 +1274,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_T5ENCODER,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+ { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
+ { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
+ { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
+ { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
+ { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
+ { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
+ { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
+ { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
+ { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_JAIS,
  {
@@ -4980,6 +4988,7 @@ static void llm_load_hparams(
  hparams.attn_soft_cap = true;

  switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_2B; break;
  case 42: model.type = e_model::MODEL_9B; break;
  case 46: model.type = e_model::MODEL_27B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
@@ -5209,6 +5218,12 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+ model.type = e_model::MODEL_UNKNOWN;
+ } break;
  case LLM_ARCH_JAIS:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5596,6 +5611,7 @@ static void llm_load_vocab(
  { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
  { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
  { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
  };

  for (const auto & it : special_token_types) {
@@ -5648,6 +5664,17 @@ static void llm_load_vocab(
  }
  }
  }
+
+ // find EOM token: "<|eom_id|>"
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+ // for now, we apply this workaround to find the EOM token based on its text
+ if (vocab.special_eom_id == -1) {
+ const auto & t = vocab.token_to_id.find("<|eom_id|>");
+ if (t != vocab.token_to_id.end()) {
+ vocab.special_eom_id = t->second;
+ }
+ }
  }

  // build special tokens cache
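
The hunk above resolves the EOM id by searching the vocabulary for the literal token text whenever the GGUF metadata key is absent. A minimal standalone sketch of that lookup pattern, with illustrative names that are not part of the package:

// Sketch: resolve a special token id from its text when no metadata id was provided.
// The map type and the -1 sentinel mirror the vocab fields above; the function name is illustrative.
#include <cstdint>
#include <string>
#include <unordered_map>

static int32_t resolve_special_token(const std::unordered_map<std::string, int32_t> & token_to_id,
                                     const std::string & text,
                                     int32_t current_id) {
    if (current_id != -1) {
        return current_id; // the metadata already supplied an id
    }
    const auto it = token_to_id.find(text);
    return it != token_to_id.end() ? it->second : -1; // fall back to a lookup by token text
}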
@@ -7431,6 +7458,42 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+ layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  case LLM_ARCH_JAIS:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -11747,6 +11810,7 @@ struct llm_build_context {

  // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
  switch (model.type) {
+ case e_model::MODEL_2B:
  case e_model::MODEL_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
  case e_model::MODEL_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
  default: LM_GGML_ABORT("fatal error");
@@ -13144,7 +13208,7 @@ struct llm_build_context {
  return gf;
  }

- struct lm_ggml_cgraph * build_t5() {
+ struct lm_ggml_cgraph * build_t5_encoder() {
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -13159,303 +13223,323 @@ struct llm_build_context {

  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

- if (lctx.is_encoding) {
- struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
-
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
+ LM_GGML_ASSERT(lctx.is_encoding);
+ struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);

- for (int il = 0; il < n_layer; ++il) {
- struct lm_ggml_tensor * inpSA = inpL;
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);

- // norm
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm_enc, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm", il);
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;

- // self-attention
- {
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
- cb(Qcur, "Qcur", il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);

- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
- cb(Kcur, "Kcur", il);
+ // self-attention
+ {
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
+ cb(Qcur, "Qcur", il);

- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
- cb(Vcur, "Vcur", il);
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
+ cb(Kcur, "Kcur", il);

- Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
+ cb(Vcur, "Vcur", il);

- struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

- struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

- struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
- struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
- struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
- cb(kq_b, "kq_b", il);
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);

- kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+ struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
+ struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);

- struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
- cb(v, "v", il);
+ kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);

- struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
- cb(kqv, "kqv", il);
+ struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+ cb(v, "v", il);

- struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);

- cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);

- lm_ggml_build_forward_expand(gf, cur);
+ cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);

- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
- cb(cur, "kqv_out", il);
- }
+ lm_ggml_build_forward_expand(gf, cur);

- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
- n_tokens = n_outputs;
- cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
+ cb(cur, "kqv_out", il);
+ }

- struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }

- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm_enc, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);

- // T5 uses relu, flan-T5 uses gelu-gated
- cur = llm_build_ffn(ctx0, lctx, cur,
- model.layers[il].ffn_up_enc, NULL, NULL,
- model.layers[il].ffn_gate_enc, NULL, NULL,
- model.layers[il].ffn_down_enc, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- cb, il);
- cb(cur, "ffn_out", il);
- }
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);

- cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ // T5 uses relu, flan-T5 uses gelu-gated
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up_enc, NULL, NULL,
+ model.layers[il].ffn_gate_enc, NULL, NULL,
+ model.layers[il].ffn_down_enc, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
  cb(cur, "ffn_out", il);
+ }

- lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = lm_ggml_add(ctx0, cur, layer_dir);
- }
- cb(cur, "l_out", il);
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);

- // input for next layer
- inpL = cur;
+ lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = lm_ggml_add(ctx0, cur, layer_dir);
  }
+ cb(cur, "l_out", il);

- cur = inpL;
- cb(cur, "result_embd", -1);
+ // input for next layer
+ inpL = cur;
+ }

- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm_enc, NULL,
- LLM_NORM_RMS, cb, -1);
- cb(cur, "result_norm", -1);
- } else {
- LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+ cur = inpL;
+ cb(cur, "result_embd", -1);

- struct lm_ggml_tensor * embd_enc = llm_build_inp_embd_enc();
- struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm_enc, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);

- struct lm_ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
- struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+ lm_ggml_build_forward_expand(gf, cur);

- for (int il = 0; il < n_layer; ++il) {
- struct lm_ggml_tensor * inpSA = inpL;
+ return gf;
+ }

- // norm
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm", il);
+ struct lm_ggml_cgraph * build_t5_decoder() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

- // self-attention
- {
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;

- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;

- llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

- struct lm_ggml_tensor * k =
- lm_ggml_view_3d(ctx0, kv_self.k_l[il],
- n_embd_head_k, n_kv, n_head_kv,
- lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
- 0);
- cb(k, "k", il);
+ LM_GGML_ASSERT(!lctx.is_encoding);
+ LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");

- struct lm_ggml_tensor * v =
- lm_ggml_view_3d(ctx0, kv_self.v_l[il],
- n_kv, n_embd_head_v, n_head_kv,
- lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
- lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
- 0);
- cb(v, "v", il);
+ struct lm_ggml_tensor * embd_enc = llm_build_inp_embd_enc();
+ struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);

- Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ struct lm_ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
+ struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;

- struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);

- struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ // self-attention
+ {
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);

- struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
- struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
- struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
- cb(kq_b, "kq_b", il);
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);

- kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);

- struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
- cb(kqv, "kqv", il);
+ llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);

- struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ struct lm_ggml_tensor * k =
+ lm_ggml_view_3d(ctx0, kv_self.k_l[il],
+ n_embd_head_k, n_kv, n_head_kv,
+ lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+ 0);
+ cb(k, "k", il);

- cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct lm_ggml_tensor * v =
+ lm_ggml_view_3d(ctx0, kv_self.v_l[il],
+ n_kv, n_embd_head_v, n_head_kv,
+ lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
+ lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+ 0);
+ cb(v, "v", il);

- lm_ggml_build_forward_expand(gf, cur);
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur);
- cb(cur, "kqv_out", il);
- }
+ struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);

- cur = lm_ggml_add(ctx0, cur, inpSA);
- cb(cur, "cross_inp", il);
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);

- struct lm_ggml_tensor * inpCA = cur;
+ struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+ struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
+ struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);

- // norm
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].attn_norm_cross, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm_cross", il);
+ kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);

- // cross-attention
- {
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
- cb(Qcur, "Qcur", il);
+ struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
+ cb(kqv, "kqv", il);

- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
- cb(Kcur, "Kcur", il);
+ struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);

- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
- cb(Vcur, "Vcur", il);
+ cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);

- Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+ lm_ggml_build_forward_expand(gf, cur);

- struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+ cb(cur, "kqv_out", il);
+ }

- struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ cur = lm_ggml_add(ctx0, cur, inpSA);
+ cb(cur, "cross_inp", il);

- kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ struct lm_ggml_tensor * inpCA = cur;

- struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
- cb(v, "v", il);
+ // norm
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_norm_cross, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm_cross", il);

- struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
- cb(kqv, "kqv", il);
+ // cross-attention
+ {
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
+ cb(Qcur, "Qcur", il);

- struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
+ cb(Kcur, "Kcur", il);

- cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
+ cb(Vcur, "Vcur", il);

- lm_ggml_build_forward_expand(gf, cur);
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);

- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
- cb(cur, "kqv_out", il);
- }
+ struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
- n_tokens = n_outputs;
- cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
- inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
- }
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);

- struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA);
- cb(ffn_inp, "ffn_inp", il);
+ kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);

- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
+ struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+ cb(v, "v", il);

- // T5 uses relu, flan-T5 uses gelu-gated
- cur = llm_build_ffn(ctx0, lctx, cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- cb, il);
- cb(cur, "ffn_out", il);
- }
+ struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);

- cur = lm_ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
+ struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);

- lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = lm_ggml_add(ctx0, cur, layer_dir);
- }
- cb(cur, "l_out", il);
+ cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);

- // input for next layer
- inpL = cur;
+ lm_ggml_build_forward_expand(gf, cur);
+
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
+ cb(cur, "kqv_out", il);
  }

- cur = inpL;
- cb(cur, "result_embd", -1);
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
+ }

- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM_RMS, cb, -1);
- cb(cur, "result_norm", -1);
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // T5 uses relu, flan-T5 uses gelu-gated
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = lm_ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);

- // lm_head
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
- cb(cur, "result_output", -1);
+ // input for next layer
+ inpL = cur;
  }

+ cur = inpL;
+ cb(cur, "result_embd", -1);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
  lm_ggml_build_forward_expand(gf, cur);

  return gf;
@@ -13907,7 +13991,15 @@ static struct lm_ggml_cgraph * llama_build_graph(
  } break;
  case LLM_ARCH_T5:
  {
- result = llm.build_t5();
+ if (lctx.is_encoding) {
+ result = llm.build_t5_encoder();
+ } else {
+ result = llm.build_t5_decoder();
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ result = llm.build_t5_encoder();
  } break;
  case LLM_ARCH_JAIS:
  {
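
With LLM_ARCH_T5ENCODER routed to build_t5_encoder() above, an encoder-only model is driven entirely through llama_encode(). A hedged usage sketch against the public llama.h API; the 4-argument llama_batch_get_one() matches the llama.cpp revision vendored here, and embeddings are assumed to be enabled on the context:

// Sketch: obtaining token embeddings from an encoder-only model (no decoder pass).
#include "llama.h"
#include <vector>

const float * encode_tokens(llama_context * ctx, const llama_model * model, std::vector<llama_token> & tokens) {
    if (!llama_model_has_encoder(model)) {
        return nullptr; // decoder-only models go through llama_decode() instead
    }
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0);
    if (llama_encode(ctx, batch) != 0) {
        return nullptr; // encoding failed
    }
    // for an encoder-only model the embeddings land in the regular output buffer
    return llama_get_embeddings(ctx);
}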
@@ -14355,7 +14447,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {

  // TODO: use a per-batch flag for logits presence instead
  const bool has_logits = !cparams.embeddings;
- const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

  const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
  const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@@ -14838,9 +14930,24 @@ static int llama_encode_internal(
  lm_ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

  // the output embeddings after the final encoder normalization
- struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 1];
+ struct lm_ggml_tensor * embd = nullptr;

- LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+ // there are two cases here
+ if (llama_model_has_decoder(&lctx.model)) {
+ // first case is an encoder-decoder T5 model where embeddings are passed to decoder
+ embd = gf->nodes[gf->n_nodes - 1];
+ LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
+ } else {
+ // second case is an encoder-only T5 model
+ if (cparams.embeddings) {
+ // only output embeddings if required
+ embd = gf->nodes[gf->n_nodes - 1];
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
+ embd = gf->nodes[gf->n_nodes - 2];
+ }
+ LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+ }
+ }

  lm_ggml_backend_sched_alloc_graph(lctx.sched, gf);

@@ -14853,20 +14960,54 @@ static int llama_encode_internal(
  lm_ggml_backend_t backend_embd = lm_ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
  LM_GGML_ASSERT(backend_embd != nullptr);

- // extract token embeddings
- LM_GGML_ASSERT(lctx.embd != nullptr);
+ if (llama_model_has_decoder(&lctx.model)) {
+ lctx.embd_enc.resize(n_tokens*n_embd);
+ float * embd_out = lctx.embd_enc.data();

- lctx.embd_enc.resize(n_tokens*n_embd);
- float * embd_out = lctx.embd_enc.data();
+ lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));

- lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ // remember the sequence ids used during the encoding - needed for cross attention later
+ lctx.seq_ids_enc.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ for (int s = 0; s < batch.n_seq_id[i]; s++) {
+ llama_seq_id seq_id = batch.seq_id[i][s];
+ lctx.seq_ids_enc[i].insert(seq_id);
+ }
+ }
+ } else {
+ LM_GGML_ASSERT(lctx.embd != nullptr);

- // remember the sequence ids used during the encoding - needed for cross attention later
- lctx.seq_ids_enc.resize(n_tokens);
- for (uint32_t i = 0; i < n_tokens; i++) {
- for (int s = 0; s < batch.n_seq_id[i]; s++) {
- llama_seq_id seq_id = batch.seq_id[i][s];
- lctx.seq_ids_enc[i].insert(seq_id);
+ switch (cparams.pooling_type) {
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ // extract token embeddings
+ LM_GGML_ASSERT(lctx.embd != nullptr);
+ float * embd_out = lctx.embd;
+
+ LM_GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+ lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ } break;
+ case LLAMA_POOLING_TYPE_MEAN:
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ // extract sequence embeddings
+ auto & embd_seq_out = lctx.embd_seq;
+ embd_seq_out.clear();
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+ continue;
+ }
+ embd_seq_out[seq_id].resize(n_embd);
+ lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
+ {
+ LM_GGML_ABORT("unknown pooling type");
+ }
  }
  }
  }
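
When a pooling type is set, the branch above writes one vector per sequence into lctx.embd_seq; a caller reads it back by sequence id. A small hedged sketch, assuming the context was created with embeddings enabled and a pooling type such as LLAMA_POOLING_TYPE_MEAN:

// Sketch: copying out a pooled sequence embedding after a successful llama_encode().
#include "llama.h"
#include <vector>

std::vector<float> pooled_embedding(llama_context * ctx, const llama_model * model, llama_seq_id seq_id) {
    const int32_t n_embd = llama_n_embd(model);
    const float * p = llama_get_embeddings_seq(ctx, seq_id); // nullptr if no pooled vector for this seq
    return p ? std::vector<float>(p, p + n_embd) : std::vector<float>{};
}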
@@ -15302,7 +15443,7 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
  const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
  auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
  if (n_expert > 1) {
- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
  // for getting the current layer as I initially thought, and we need to resort to parsing the
  // tensor name.
@@ -16576,6 +16717,8 @@ struct llama_context * llama_new_context_with_model(

  ctx->sampling.rng = std::mt19937(params.seed);
  ctx->logits_all = params.logits_all;
+ // build worst-case graph for encoder if a model contains encoder
+ ctx->is_encoding = llama_model_has_encoder(model);

  uint32_t kv_size = cparams.n_ctx;
  lm_ggml_type type_k = params.type_k;
@@ -16890,6 +17033,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_MAMBA:
  case LLM_ARCH_JINA_BERT_V2:
  case LLM_ARCH_T5:
+ case LLM_ARCH_T5ENCODER:
  case LLM_ARCH_JAIS:
  return LLAMA_ROPE_TYPE_NONE;

@@ -17037,8 +17181,16 @@ struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const

  bool llama_model_has_encoder(const struct llama_model * model) {
  switch (model->arch) {
- case LLM_ARCH_T5: return true;
- default: return false;
+ case LLM_ARCH_T5: return true;
+ case LLM_ARCH_T5ENCODER: return true;
+ default: return false;
+ }
+ }
+
+ bool llama_model_has_decoder(const struct llama_model * model) {
+ switch (model->arch) {
+ case LLM_ARCH_T5ENCODER: return false;
+ default: return true;
  }
  }

@@ -17341,6 +17493,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
  // TODO: replace all non-fatal assertions with returned errors or exceptions
  struct llama_data_write {
  virtual void write(const void * src, size_t size) = 0;
+ virtual void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) = 0;
  virtual size_t get_size_written() = 0;
  virtual ~llama_data_write() = default;

@@ -17463,9 +17616,8 @@ struct llama_data_write {
  // Read each range of cells of k_size length each into tmp_buf and write out
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * k_size_row);
- lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * k_size_row;
+ write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
  }
  }

@@ -17484,9 +17636,8 @@ struct llama_data_write {
  // Read each range of cells of v_size length each into tmp_buf and write out
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * v_size_row);
- lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * v_size_row;
+ write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
  }
  }
  } else {
@@ -17512,9 +17663,8 @@ struct llama_data_write {
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
  const size_t src_offset = (range.first + j * kv_size) * v_size_el;
- tmp_buf.resize(range_size * v_size_el);
- lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * v_size_el;
+ write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
  }
  }
  }
@@ -17873,12 +18023,14 @@ struct llama_data_write_dummy : llama_data_write {

  llama_data_write_dummy() {}

- // TODO: avoid unnecessary calls to lm_ggml_backend_tensor_get in a dummy context
-
  void write(const void * /* src */, size_t size) override {
  size_written += size;
  }

+ void write_tensor_data(const struct lm_ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+ size_written += size;
+ }
+
  size_t get_size_written() override {
  return size_written;
  }
@@ -17901,6 +18053,16 @@ struct llama_data_write_buffer : llama_data_write {
  buf_size -= size;
  }

+ void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ lm_ggml_backend_tensor_get(tensor, ptr, offset, size);
+ ptr += size;
+ size_written += size;
+ buf_size -= size;
+ }
+
  size_t get_size_written() override {
  return size_written;
  }
@@ -17936,6 +18098,7 @@ struct llama_data_read_buffer : llama_data_read {
  struct llama_data_write_file : llama_data_write {
  llama_file * file;
  size_t size_written = 0;
+ std::vector<uint8_t> temp_buffer;

  llama_data_write_file(llama_file * f) : file(f) {}

@@ -17944,6 +18107,12 @@ struct llama_data_write_file : llama_data_write {
  size_written += size;
  }

+ void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+ temp_buffer.resize(size);
+ lm_ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+ write(temp_buffer.data(), temp_buffer.size());
+ }
+
  size_t get_size_written() override {
  return size_written;
  }
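
The write_tensor_data() override added above lets each state writer decide how KV tensor ranges are copied: the dummy writer only counts bytes, the buffer writer copies straight into the destination, and the file writer stages through a reusable temp_buffer, instead of every call site managing its own temporary. From the caller's side the session API is unchanged; a hedged sketch of the path that exercises llama_data_write_file, with a placeholder file name:

// Sketch: saving session state, which drives the write_tensor_data() path shown above.
#include "llama.h"
#include <vector>

bool save_session(llama_context * ctx, const std::vector<llama_token> & tokens) {
    return llama_state_save_file(ctx, "session.bin", tokens.data(), tokens.size());
}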