cui-llama.rn 1.0.7 → 1.0.10

package/cpp/llama.cpp CHANGED
@@ -132,20 +132,6 @@ static std::string trim(const std::string & str) {
  return str.substr(start, end - start);
  }
 
- static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- std::string result;
- for (size_t pos = 0; ; pos += search.length()) {
- auto new_pos = s.find(search, pos);
- if (new_pos == std::string::npos) {
- result += s.substr(pos, s.size() - pos);
- break;
- }
- result += s.substr(pos, new_pos - pos) + replace;
- pos = new_pos;
- }
- s = std::move(result);
- }
-
  static bool is_float_close(float a, float b, float abs_tol) {
  // Check for non-negative tolerance
  if (abs_tol < 0.0) {
@@ -233,6 +219,7 @@ enum llm_arch {
  LLM_ARCH_CHATGLM,
  LLM_ARCH_BITNET,
  LLM_ARCH_T5,
+ LLM_ARCH_T5ENCODER,
  LLM_ARCH_JAIS,
  LLM_ARCH_UNKNOWN,
  };
@@ -277,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_CHATGLM, "chatglm" },
  { LLM_ARCH_BITNET, "bitnet" },
  { LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
  { LLM_ARCH_JAIS, "jais" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
@@ -373,6 +361,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_SUFFIX_ID,
  LLM_KV_TOKENIZER_MIDDLE_ID,
  LLM_KV_TOKENIZER_EOT_ID,
+ LLM_KV_TOKENIZER_EOM_ID,
 
  LLM_KV_ADAPTER_TYPE,
  LLM_KV_ADAPTER_LORA_ALPHA,
@@ -470,6 +459,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
  { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
  { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+ { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
 
  { LLM_KV_ADAPTER_TYPE, "adapter.type" },
  { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -1284,6 +1274,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_T5ENCODER,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+ { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
+ { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
+ { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
+ { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
+ { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
+ { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
+ { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
+ { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
+ { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_JAIS,
  {
@@ -3578,13 +3586,8 @@ namespace GGUFMeta {
 
  using llama_buf_map = std::unordered_map<uint32_t, lm_ggml_backend_buffer_t>;
 
- // TODO: update when needed or think of some clever automatic way to do this
- static size_t llama_model_max_nodes(const llama_model & /*model*/) {
- //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
- // return 32768;
- //}
-
- return 8192;
+ static size_t llama_model_max_nodes(const llama_model & model) {
+ return std::max<size_t>(8192, model.tensors_by_name.size()*5);
  }
 
  struct llama_model_loader {
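The hunk above replaces the fixed 8192-node compute-graph budget with one that scales with the number of model tensors, so very large checkpoints no longer need a hard-coded special case. A minimal sketch of the same heuristic, with `n_model_tensors` standing in for `model.tensors_by_name.size()`:

```cpp
#include <algorithm>
#include <cstddef>

// Keep a floor of 8192 graph nodes and grow roughly 5 nodes per weight tensor.
static size_t max_graph_nodes(size_t n_model_tensors) {
    return std::max<size_t>(8192, n_model_tensors * 5);
}
// e.g. 1000 tensors keeps the 8192 floor, 10000 tensors yields a 50000-node budget
```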
@@ -4904,7 +4907,6 @@ static void llm_load_hparams(
  } break;
  case LLM_ARCH_PHI3:
  {
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
  switch (hparams.n_layer) {
@@ -4913,6 +4915,22 @@ static void llm_load_hparams(
  case 40: model.type = e_model::MODEL_14B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
+
+ // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+ if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+ // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+ hparams.n_swa = 2047;
+ } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+ // default value for Phi-3-mini-128k-instruct
+ hparams.n_swa = 262144;
+ } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+ // default value for Phi-3-medium-128k-instruct
+ hparams.n_swa = 131072;
+ }
+ bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (!found_swa && hparams.n_swa == 0) {
+ throw std::runtime_error("invalid value for sliding_window");
+ }
  } break;
  case LLM_ARCH_PLAMO:
  {
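The new Phi-3 branch above first fills `hparams.n_swa` with a model-family default keyed on layer count and training context, then lets an explicit `LLM_KV_ATTENTION_SLIDING_WINDOW` value override it, and only errors out when neither source provides a value. A simplified sketch of that fallback order (the mini-128k case that also checks `n_head_kv` is omitted; the function name and parameters here are illustrative):

```cpp
#include <cstdint>
#include <stdexcept>

static uint32_t resolve_phi3_swa(uint32_t n_layer, uint32_t n_ctx_train,
                                 bool kv_found, uint32_t kv_value) {
    uint32_t n_swa = 0;
    if ((n_layer == 32 || n_layer == 40) && n_ctx_train == 4096) {
        n_swa = 2047;      // 4k instruct variants
    } else if (n_layer == 40 && n_ctx_train == 131072) {
        n_swa = 131072;    // medium 128k variant
    }
    if (kv_found) {
        n_swa = kv_value;  // explicit GGUF metadata wins over the default
    }
    if (!kv_found && n_swa == 0) {
        throw std::runtime_error("invalid value for sliding_window");
    }
    return n_swa;
}
```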
@@ -5210,6 +5228,12 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+ model.type = e_model::MODEL_UNKNOWN;
+ } break;
  case LLM_ARCH_JAIS:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5454,6 +5478,12 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "codeshell") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+ } else if (
+ tokenizer_pre == "bloom") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+ } else if (
+ tokenizer_pre == "gpt3-finnish") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -5597,6 +5627,7 @@ static void llm_load_vocab(
  { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
  { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
  { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
  };
 
  for (const auto & it : special_token_types) {
@@ -5649,6 +5680,17 @@ static void llm_load_vocab(
  }
  }
  }
+
+ // find EOM token: "<|eom_id|>"
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+ // for now, we apply this workaround to find the EOM token based on its text
+ if (vocab.special_eom_id == -1) {
+ const auto & t = vocab.token_to_id.find("<|eom_id|>");
+ if (t != vocab.token_to_id.end()) {
+ vocab.special_eom_id = t->second;
+ }
+ }
  }
 
  // build special tokens cache
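As the added comment notes, the EOM id is resolved from the token text when the `tokenizer.ggml.eom_token_id` key is absent. A self-contained sketch of that lookup, using `std::unordered_map` as a stand-in for the vocab's `token_to_id` map and a plain `int32_t` for `llama_token`:

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>

using token_id = int32_t;  // stand-in for llama_token

static token_id resolve_eom_id(const std::unordered_map<std::string, token_id> & token_to_id,
                               token_id special_eom_id /* -1 when metadata is missing */) {
    if (special_eom_id != -1) {
        return special_eom_id;            // metadata already provided the id
    }
    const auto it = token_to_id.find("<|eom_id|>");
    return it != token_to_id.end() ? it->second : -1;
}
```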
@@ -7432,6 +7474,42 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+ layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  case LLM_ARCH_JAIS:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -13146,7 +13224,7 @@ struct llm_build_context {
  return gf;
  }
 
- struct lm_ggml_cgraph * build_t5() {
+ struct lm_ggml_cgraph * build_t5_encoder() {
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -13161,303 +13239,323 @@ struct llm_build_context {
13161
13239
 
13162
13240
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
13163
13241
 
13164
- if (lctx.is_encoding) {
13165
- struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
13242
+ LM_GGML_ASSERT(lctx.is_encoding);
13243
+ struct lm_ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
13166
13244
 
13167
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13168
- struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
13169
-
13170
- for (int il = 0; il < n_layer; ++il) {
13171
- struct lm_ggml_tensor * inpSA = inpL;
13245
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13246
+ struct lm_ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
13172
13247
 
13173
- // norm
13174
- cur = llm_build_norm(ctx0, inpL, hparams,
13175
- model.layers[il].attn_norm_enc, NULL,
13176
- LLM_NORM_RMS, cb, il);
13177
- cb(cur, "attn_norm", il);
13248
+ for (int il = 0; il < n_layer; ++il) {
13249
+ struct lm_ggml_tensor * inpSA = inpL;
13178
13250
 
13179
- // self-attention
13180
- {
13181
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
13182
- cb(Qcur, "Qcur", il);
13251
+ // norm
13252
+ cur = llm_build_norm(ctx0, inpL, hparams,
13253
+ model.layers[il].attn_norm_enc, NULL,
13254
+ LLM_NORM_RMS, cb, il);
13255
+ cb(cur, "attn_norm", il);
13183
13256
 
13184
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
13185
- cb(Kcur, "Kcur", il);
13257
+ // self-attention
13258
+ {
13259
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
13260
+ cb(Qcur, "Qcur", il);
13186
13261
 
13187
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
13188
- cb(Vcur, "Vcur", il);
13262
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
13263
+ cb(Kcur, "Kcur", il);
13189
13264
 
13190
- Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13191
- Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13265
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
13266
+ cb(Vcur, "Vcur", il);
13192
13267
 
13193
- struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
13194
- struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
13268
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13269
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13195
13270
 
13196
- struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
13197
- cb(kq, "kq", il);
13271
+ struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
13272
+ struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
13198
13273
 
13199
- struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
13200
- struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
13201
- struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
13202
- cb(kq_b, "kq_b", il);
13274
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
13275
+ cb(kq, "kq", il);
13203
13276
 
13204
- kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
13205
- cb(kq, "kq_soft_max_ext", il);
13277
+ struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
13278
+ struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
13279
+ struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
13280
+ cb(kq_b, "kq_b", il);
13206
13281
 
13207
- struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
13208
- cb(v, "v", il);
13282
+ kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
13283
+ cb(kq, "kq_soft_max_ext", il);
13209
13284
 
13210
- struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
13211
- cb(kqv, "kqv", il);
13285
+ struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
13286
+ cb(v, "v", il);
13212
13287
 
13213
- struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
13214
- cb(kqv_merged, "kqv_merged", il);
13288
+ struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
13289
+ cb(kqv, "kqv", il);
13215
13290
 
13216
- cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
13217
- cb(cur, "kqv_merged_cont", il);
13291
+ struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
13292
+ cb(kqv_merged, "kqv_merged", il);
13218
13293
 
13219
- lm_ggml_build_forward_expand(gf, cur);
13294
+ cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
13295
+ cb(cur, "kqv_merged_cont", il);
13220
13296
 
13221
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
13222
- cb(cur, "kqv_out", il);
13223
- }
13297
+ lm_ggml_build_forward_expand(gf, cur);
13224
13298
 
13225
- if (il == n_layer - 1) {
13226
- // skip computing output for unused tokens
13227
- struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13228
- n_tokens = n_outputs;
13229
- cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13230
- inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13231
- }
13299
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
13300
+ cb(cur, "kqv_out", il);
13301
+ }
13232
13302
 
13233
- struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
13234
- cb(ffn_inp, "ffn_inp", il);
13303
+ if (il == n_layer - 1) {
13304
+ // skip computing output for unused tokens
13305
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13306
+ n_tokens = n_outputs;
13307
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13308
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13309
+ }
13235
13310
 
13236
- // feed-forward network
13237
- {
13238
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
13239
- model.layers[il].ffn_norm_enc, NULL,
13240
- LLM_NORM_RMS, cb, il);
13241
- cb(cur, "ffn_norm", il);
13311
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
13312
+ cb(ffn_inp, "ffn_inp", il);
13242
13313
 
13243
- // T5 uses relu, flan-T5 uses gelu-gated
13244
- cur = llm_build_ffn(ctx0, lctx, cur,
13245
- model.layers[il].ffn_up_enc, NULL, NULL,
13246
- model.layers[il].ffn_gate_enc, NULL, NULL,
13247
- model.layers[il].ffn_down_enc, NULL, NULL,
13248
- NULL,
13249
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
13250
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
13251
- cb, il);
13252
- cb(cur, "ffn_out", il);
13253
- }
13314
+ // feed-forward network
13315
+ {
13316
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
13317
+ model.layers[il].ffn_norm_enc, NULL,
13318
+ LLM_NORM_RMS, cb, il);
13319
+ cb(cur, "ffn_norm", il);
13254
13320
 
13255
- cur = lm_ggml_add(ctx0, cur, ffn_inp);
13321
+ // T5 uses relu, flan-T5 uses gelu-gated
13322
+ cur = llm_build_ffn(ctx0, lctx, cur,
13323
+ model.layers[il].ffn_up_enc, NULL, NULL,
13324
+ model.layers[il].ffn_gate_enc, NULL, NULL,
13325
+ model.layers[il].ffn_down_enc, NULL, NULL,
13326
+ NULL,
13327
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
13328
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
13329
+ cb, il);
13256
13330
  cb(cur, "ffn_out", il);
13331
+ }
13257
13332
 
13258
- lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
13259
- if (layer_dir != nullptr) {
13260
- cur = lm_ggml_add(ctx0, cur, layer_dir);
13261
- }
13262
- cb(cur, "l_out", il);
13333
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
13334
+ cb(cur, "ffn_out", il);
13263
13335
 
13264
- // input for next layer
13265
- inpL = cur;
13336
+ lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
13337
+ if (layer_dir != nullptr) {
13338
+ cur = lm_ggml_add(ctx0, cur, layer_dir);
13266
13339
  }
13340
+ cb(cur, "l_out", il);
13267
13341
 
13268
- cur = inpL;
13269
- cb(cur, "result_embd", -1);
13342
+ // input for next layer
13343
+ inpL = cur;
13344
+ }
13270
13345
 
13271
- cur = llm_build_norm(ctx0, cur, hparams,
13272
- model.output_norm_enc, NULL,
13273
- LLM_NORM_RMS, cb, -1);
13274
- cb(cur, "result_norm", -1);
13275
- } else {
13276
- LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
13346
+ cur = inpL;
13347
+ cb(cur, "result_embd", -1);
13277
13348
 
13278
- struct lm_ggml_tensor * embd_enc = llm_build_inp_embd_enc();
13279
- struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
13349
+ cur = llm_build_norm(ctx0, cur, hparams,
13350
+ model.output_norm_enc, NULL,
13351
+ LLM_NORM_RMS, cb, -1);
13352
+ cb(cur, "result_norm", -1);
13280
13353
 
13281
- struct lm_ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
13282
- struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
13354
+ lm_ggml_build_forward_expand(gf, cur);
13283
13355
 
13284
- for (int il = 0; il < n_layer; ++il) {
13285
- struct lm_ggml_tensor * inpSA = inpL;
13356
+ return gf;
13357
+ }
13286
13358
 
13287
- // norm
13288
- cur = llm_build_norm(ctx0, inpL, hparams,
13289
- model.layers[il].attn_norm, NULL,
13290
- LLM_NORM_RMS, cb, il);
13291
- cb(cur, "attn_norm", il);
13359
+ struct lm_ggml_cgraph * build_t5_decoder() {
13360
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13292
13361
 
13293
- // self-attention
13294
- {
13295
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
13296
- cb(Qcur, "Qcur", il);
13362
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
13363
+ int32_t n_tokens = this->n_tokens;
13297
13364
 
13298
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
13299
- cb(Kcur, "Kcur", il);
13365
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13366
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
13367
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13300
13368
 
13301
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
13302
- cb(Vcur, "Vcur", il);
13369
+ struct lm_ggml_tensor * cur;
13370
+ struct lm_ggml_tensor * inpL;
13303
13371
 
13304
- llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
13372
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
13305
13373
 
13306
- struct lm_ggml_tensor * k =
13307
- lm_ggml_view_3d(ctx0, kv_self.k_l[il],
13308
- n_embd_head_k, n_kv, n_head_kv,
13309
- lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
13310
- lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
13311
- 0);
13312
- cb(k, "k", il);
13374
+ LM_GGML_ASSERT(!lctx.is_encoding);
13375
+ LM_GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
13313
13376
 
13314
- struct lm_ggml_tensor * v =
13315
- lm_ggml_view_3d(ctx0, kv_self.v_l[il],
13316
- n_kv, n_embd_head_v, n_head_kv,
13317
- lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
13318
- lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
13319
- 0);
13320
- cb(v, "v", il);
13377
+ struct lm_ggml_tensor * embd_enc = llm_build_inp_embd_enc();
13378
+ struct lm_ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
13321
13379
 
13322
- Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13380
+ struct lm_ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
13381
+ struct lm_ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
13382
+
13383
+ for (int il = 0; il < n_layer; ++il) {
13384
+ struct lm_ggml_tensor * inpSA = inpL;
13323
13385
 
13324
- struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
13386
+ // norm
13387
+ cur = llm_build_norm(ctx0, inpL, hparams,
13388
+ model.layers[il].attn_norm, NULL,
13389
+ LLM_NORM_RMS, cb, il);
13390
+ cb(cur, "attn_norm", il);
13325
13391
 
13326
- struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
13327
- cb(kq, "kq", il);
13392
+ // self-attention
13393
+ {
13394
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13395
+ cb(Qcur, "Qcur", il);
13328
13396
 
13329
- struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
13330
- struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
13331
- struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
13332
- cb(kq_b, "kq_b", il);
13397
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13398
+ cb(Kcur, "Kcur", il);
13333
13399
 
13334
- kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
13335
- cb(kq, "kq_soft_max_ext", il);
13400
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13401
+ cb(Vcur, "Vcur", il);
13336
13402
 
13337
- struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
13338
- cb(kqv, "kqv", il);
13403
+ llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
13339
13404
 
13340
- struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
13341
- cb(kqv_merged, "kqv_merged", il);
13405
+ struct lm_ggml_tensor * k =
13406
+ lm_ggml_view_3d(ctx0, kv_self.k_l[il],
13407
+ n_embd_head_k, n_kv, n_head_kv,
13408
+ lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
13409
+ lm_ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
13410
+ 0);
13411
+ cb(k, "k", il);
13342
13412
 
13343
- cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
13344
- cb(cur, "kqv_merged_cont", il);
13413
+ struct lm_ggml_tensor * v =
13414
+ lm_ggml_view_3d(ctx0, kv_self.v_l[il],
13415
+ n_kv, n_embd_head_v, n_head_kv,
13416
+ lm_ggml_element_size(kv_self.v_l[il])*n_ctx,
13417
+ lm_ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
13418
+ 0);
13419
+ cb(v, "v", il);
13345
13420
 
13346
- lm_ggml_build_forward_expand(gf, cur);
13421
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13347
13422
 
13348
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur);
13349
- cb(cur, "kqv_out", il);
13350
- }
13423
+ struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
13351
13424
 
13352
- cur = lm_ggml_add(ctx0, cur, inpSA);
13353
- cb(cur, "cross_inp", il);
13425
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
13426
+ cb(kq, "kq", il);
13354
13427
 
13355
- struct lm_ggml_tensor * inpCA = cur;
13428
+ struct lm_ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
13429
+ struct lm_ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
13430
+ struct lm_ggml_tensor * kq_b = lm_ggml_add(ctx0, kq, pos_bias);
13431
+ cb(kq_b, "kq_b", il);
13356
13432
 
13357
- // norm
13358
- cur = llm_build_norm(ctx0, cur, hparams,
13359
- model.layers[il].attn_norm_cross, NULL,
13360
- LLM_NORM_RMS, cb, il);
13361
- cb(cur, "attn_norm_cross", il);
13433
+ kq = lm_ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
13434
+ cb(kq, "kq_soft_max_ext", il);
13362
13435
 
13363
- // cross-attention
13364
- {
13365
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
13366
- cb(Qcur, "Qcur", il);
13436
+ struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
13437
+ cb(kqv, "kqv", il);
13367
13438
 
13368
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
13369
- cb(Kcur, "Kcur", il);
13439
+ struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
13440
+ cb(kqv_merged, "kqv_merged", il);
13370
13441
 
13371
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
13372
- cb(Vcur, "Vcur", il);
13442
+ cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
13443
+ cb(cur, "kqv_merged_cont", il);
13373
13444
 
13374
- Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13375
- Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
13445
+ lm_ggml_build_forward_expand(gf, cur);
13376
13446
 
13377
- struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
13378
- struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
13447
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
13448
+ cb(cur, "kqv_out", il);
13449
+ }
13379
13450
 
13380
- struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
13381
- cb(kq, "kq", il);
13451
+ cur = lm_ggml_add(ctx0, cur, inpSA);
13452
+ cb(cur, "cross_inp", il);
13382
13453
 
13383
- kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
13384
- cb(kq, "kq_soft_max_ext", il);
13454
+ struct lm_ggml_tensor * inpCA = cur;
13385
13455
 
13386
- struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
13387
- cb(v, "v", il);
13456
+ // norm
13457
+ cur = llm_build_norm(ctx0, cur, hparams,
13458
+ model.layers[il].attn_norm_cross, NULL,
13459
+ LLM_NORM_RMS, cb, il);
13460
+ cb(cur, "attn_norm_cross", il);
13388
13461
 
13389
- struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
13390
- cb(kqv, "kqv", il);
13462
+ // cross-attention
13463
+ {
13464
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
13465
+ cb(Qcur, "Qcur", il);
13391
13466
 
13392
- struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
13393
- cb(kqv_merged, "kqv_merged", il);
13467
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
13468
+ cb(Kcur, "Kcur", il);
13394
13469
 
13395
- cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
13396
- cb(cur, "kqv_merged_cont", il);
13470
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
13471
+ cb(Vcur, "Vcur", il);
13397
13472
 
13398
- lm_ggml_build_forward_expand(gf, cur);
13473
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13474
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
13399
13475
 
13400
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
13401
- cb(cur, "kqv_out", il);
13402
- }
13476
+ struct lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
13477
+ struct lm_ggml_tensor * k = lm_ggml_cont(ctx0, lm_ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
13403
13478
 
13404
- if (il == n_layer - 1) {
13405
- // skip computing output for unused tokens
13406
- struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13407
- n_tokens = n_outputs;
13408
- cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13409
- inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13410
- inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
13411
- }
13479
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
13480
+ cb(kq, "kq", il);
13412
13481
 
13413
- struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA);
13414
- cb(ffn_inp, "ffn_inp", il);
13482
+ kq = lm_ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
13483
+ cb(kq, "kq_soft_max_ext", il);
13415
13484
 
13416
- // feed-forward network
13417
- {
13418
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
13419
- model.layers[il].ffn_norm, NULL,
13420
- LLM_NORM_RMS, cb, il);
13421
- cb(cur, "ffn_norm", il);
13485
+ struct lm_ggml_tensor * v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, lm_ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
13486
+ cb(v, "v", il);
13422
13487
 
13423
- // T5 uses relu, flan-T5 uses gelu-gated
13424
- cur = llm_build_ffn(ctx0, lctx, cur,
13425
- model.layers[il].ffn_up, NULL, NULL,
13426
- model.layers[il].ffn_gate, NULL, NULL,
13427
- model.layers[il].ffn_down, NULL, NULL,
13428
- NULL,
13429
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
13430
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
13431
- cb, il);
13432
- cb(cur, "ffn_out", il);
13433
- }
13488
+ struct lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, lm_ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
13489
+ cb(kqv, "kqv", il);
13434
13490
 
13435
- cur = lm_ggml_add(ctx0, cur, ffn_inp);
13436
- cb(cur, "ffn_out", il);
13491
+ struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
13492
+ cb(kqv_merged, "kqv_merged", il);
13437
13493
 
13438
- lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
13439
- if (layer_dir != nullptr) {
13440
- cur = lm_ggml_add(ctx0, cur, layer_dir);
13441
- }
13442
- cb(cur, "l_out", il);
13494
+ cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
13495
+ cb(cur, "kqv_merged_cont", il);
13443
13496
 
13444
- // input for next layer
13445
- inpL = cur;
13497
+ lm_ggml_build_forward_expand(gf, cur);
13498
+
13499
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
13500
+ cb(cur, "kqv_out", il);
13446
13501
  }
13447
13502
 
13448
- cur = inpL;
13449
- cb(cur, "result_embd", -1);
13503
+ if (il == n_layer - 1) {
13504
+ // skip computing output for unused tokens
13505
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13506
+ n_tokens = n_outputs;
13507
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13508
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13509
+ inpCA = lm_ggml_get_rows(ctx0, inpCA, inp_out_ids);
13510
+ }
13450
13511
 
13451
- cur = llm_build_norm(ctx0, cur, hparams,
13452
- model.output_norm, NULL,
13453
- LLM_NORM_RMS, cb, -1);
13454
- cb(cur, "result_norm", -1);
13512
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpCA);
13513
+ cb(ffn_inp, "ffn_inp", il);
13455
13514
 
13456
- // lm_head
13457
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
13458
- cb(cur, "result_output", -1);
13515
+ // feed-forward network
13516
+ {
13517
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
13518
+ model.layers[il].ffn_norm, NULL,
13519
+ LLM_NORM_RMS, cb, il);
13520
+ cb(cur, "ffn_norm", il);
13521
+
13522
+ // T5 uses relu, flan-T5 uses gelu-gated
13523
+ cur = llm_build_ffn(ctx0, lctx, cur,
13524
+ model.layers[il].ffn_up, NULL, NULL,
13525
+ model.layers[il].ffn_gate, NULL, NULL,
13526
+ model.layers[il].ffn_down, NULL, NULL,
13527
+ NULL,
13528
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
13529
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
13530
+ cb, il);
13531
+ cb(cur, "ffn_out", il);
13532
+ }
13533
+
13534
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
13535
+ cb(cur, "ffn_out", il);
13536
+
13537
+ lm_ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
13538
+ if (layer_dir != nullptr) {
13539
+ cur = lm_ggml_add(ctx0, cur, layer_dir);
13540
+ }
13541
+ cb(cur, "l_out", il);
13542
+
13543
+ // input for next layer
13544
+ inpL = cur;
13459
13545
  }
13460
13546
 
13547
+ cur = inpL;
13548
+ cb(cur, "result_embd", -1);
13549
+
13550
+ cur = llm_build_norm(ctx0, cur, hparams,
13551
+ model.output_norm, NULL,
13552
+ LLM_NORM_RMS, cb, -1);
13553
+ cb(cur, "result_norm", -1);
13554
+
13555
+ // lm_head
13556
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13557
+ cb(cur, "result_output", -1);
13558
+
13461
13559
  lm_ggml_build_forward_expand(gf, cur);
13462
13560
 
13463
13561
  return gf;
@@ -13909,7 +14007,15 @@ static struct lm_ggml_cgraph * llama_build_graph(
  } break;
  case LLM_ARCH_T5:
  {
- result = llm.build_t5();
+ if (lctx.is_encoding) {
+ result = llm.build_t5_encoder();
+ } else {
+ result = llm.build_t5_decoder();
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ result = llm.build_t5_encoder();
  } break;
  case LLM_ARCH_JAIS:
  {
@@ -14357,7 +14463,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
 
  // TODO: use a per-batch flag for logits presence instead
  const bool has_logits = !cparams.embeddings;
- const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
  const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
  const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@@ -14628,12 +14734,15 @@ static int llama_decode_internal(
  res = nullptr;
  embd = nullptr;
  } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
+ res = nullptr; // do not extract logits for embedding case
+ embd = nullptr;
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+ embd = gf->nodes[i];
+ break;
+ }
  }
- LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+ LM_GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
  } else {
  embd = nullptr; // do not extract embeddings when not needed
  LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
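The embedding path above no longer assumes `result_embd_pooled` is one of the last two graph nodes; it scans the graph backwards until the named node is found. A stripped-down sketch of the same search over a hypothetical node array:

```cpp
#include <cstring>

struct graph_node { const char * name; };

static graph_node * find_pooled_embd(graph_node ** nodes, int n_nodes) {
    for (int i = n_nodes - 1; i >= 0; --i) {
        if (std::strcmp(nodes[i]->name, "result_embd_pooled") == 0) {
            return nodes[i];   // found the pooled-embedding tensor
        }
    }
    return nullptr;            // caller asserts that this does not happen
}
```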
@@ -14840,9 +14949,24 @@ static int llama_encode_internal(
  lm_ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
 
  // the output embeddings after the final encoder normalization
- struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 1];
+ struct lm_ggml_tensor * embd = nullptr;
 
- LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+ // there are two cases here
+ if (llama_model_has_decoder(&lctx.model)) {
+ // first case is an encoder-decoder T5 model where embeddings are passed to decoder
+ embd = gf->nodes[gf->n_nodes - 1];
+ LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
+ } else {
+ // second case is an encoder-only T5 model
+ if (cparams.embeddings) {
+ // only output embeddings if required
+ embd = gf->nodes[gf->n_nodes - 1];
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
+ embd = gf->nodes[gf->n_nodes - 2];
+ }
+ LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+ }
+ }
 
  lm_ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
@@ -14855,20 +14979,54 @@ static int llama_encode_internal(
  lm_ggml_backend_t backend_embd = lm_ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
  LM_GGML_ASSERT(backend_embd != nullptr);
 
- // extract token embeddings
- LM_GGML_ASSERT(lctx.embd != nullptr);
+ if (llama_model_has_decoder(&lctx.model)) {
+ lctx.embd_enc.resize(n_tokens*n_embd);
+ float * embd_out = lctx.embd_enc.data();
 
- lctx.embd_enc.resize(n_tokens*n_embd);
- float * embd_out = lctx.embd_enc.data();
+ lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
 
- lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ // remember the sequence ids used during the encoding - needed for cross attention later
+ lctx.seq_ids_enc.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ for (int s = 0; s < batch.n_seq_id[i]; s++) {
+ llama_seq_id seq_id = batch.seq_id[i][s];
+ lctx.seq_ids_enc[i].insert(seq_id);
+ }
+ }
+ } else {
+ LM_GGML_ASSERT(lctx.embd != nullptr);
 
- // remember the sequence ids used during the encoding - needed for cross attention later
- lctx.seq_ids_enc.resize(n_tokens);
- for (uint32_t i = 0; i < n_tokens; i++) {
- for (int s = 0; s < batch.n_seq_id[i]; s++) {
- llama_seq_id seq_id = batch.seq_id[i][s];
- lctx.seq_ids_enc[i].insert(seq_id);
+ switch (cparams.pooling_type) {
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ // extract token embeddings
+ LM_GGML_ASSERT(lctx.embd != nullptr);
+ float * embd_out = lctx.embd;
+
+ LM_GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+ lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ } break;
+ case LLAMA_POOLING_TYPE_MEAN:
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ // extract sequence embeddings
+ auto & embd_seq_out = lctx.embd_seq;
+ embd_seq_out.clear();
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+ continue;
+ }
+ embd_seq_out[seq_id].resize(n_embd);
+ lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
+ {
+ LM_GGML_ABORT("unknown pooling type");
+ }
  }
  }
  }
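With this change, `llama_encode()` on an encoder-only model writes either token embeddings or pooled per-sequence embeddings, mirroring what `llama_decode()` does for embedding models. A hedged usage sketch, assuming a context created with embeddings enabled and a pooling type such as mean; batch construction and error handling are omitted:

```cpp
#include "llama.h"

// returns the pooled embedding of sequence 0, or nullptr on failure
static const float * encode_and_read_seq0(llama_model * model, llama_context * ctx, llama_batch batch) {
    if (!llama_model_has_encoder(model) || llama_model_has_decoder(model)) {
        return nullptr;                       // only the encoder-only path is shown here
    }
    if (llama_encode(ctx, batch) != 0) {
        return nullptr;
    }
    return llama_get_embeddings_seq(ctx, 0);  // n_embd floats for seq_id 0
}
```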
@@ -15304,7 +15462,7 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
  const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
  auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
  if (n_expert > 1) {
- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
  // for getting the current layer as I initially thought, and we need to resort to parsing the
  // tensor name.
@@ -16578,6 +16736,8 @@ struct llama_context * llama_new_context_with_model(
 
  ctx->sampling.rng = std::mt19937(params.seed);
  ctx->logits_all = params.logits_all;
+ // build worst-case graph for encoder if a model contains encoder
+ ctx->is_encoding = llama_model_has_encoder(model);
 
  uint32_t kv_size = cparams.n_ctx;
  lm_ggml_type type_k = params.type_k;
@@ -16892,6 +17052,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_MAMBA:
  case LLM_ARCH_JINA_BERT_V2:
  case LLM_ARCH_T5:
+ case LLM_ARCH_T5ENCODER:
  case LLM_ARCH_JAIS:
  return LLAMA_ROPE_TYPE_NONE;
 
@@ -17039,8 +17200,16 @@ struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const
 
  bool llama_model_has_encoder(const struct llama_model * model) {
  switch (model->arch) {
- case LLM_ARCH_T5: return true;
- default: return false;
+ case LLM_ARCH_T5: return true;
+ case LLM_ARCH_T5ENCODER: return true;
+ default: return false;
+ }
+ }
+
+ bool llama_model_has_decoder(const struct llama_model * model) {
+ switch (model->arch) {
+ case LLM_ARCH_T5ENCODER: return false;
+ default: return true;
  }
  }
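Together with the new `llama_model_has_decoder()`, callers can branch on model capabilities instead of architecture enums. A hedged sketch of the dispatch a frontend might use (the encoder-decoder case would switch to a decoder batch where indicated; the function name is illustrative):

```cpp
#include "llama.h"

static int32_t eval_batch(llama_model * model, llama_context * ctx, llama_batch batch) {
    if (llama_model_has_encoder(model)) {
        const int32_t ret = llama_encode(ctx, batch);
        if (ret != 0 || !llama_model_has_decoder(model)) {
            return ret;                // T5ENCODER: encoding is the whole job
        }
        // T5: at this point the caller would build a decoder batch
        // (e.g. starting from the decoder start token) before decoding
    }
    return llama_decode(ctx, batch);   // decoder-only and encoder-decoder models
}
```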
 
@@ -17343,6 +17512,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
  // TODO: replace all non-fatal assertions with returned errors or exceptions
  struct llama_data_write {
  virtual void write(const void * src, size_t size) = 0;
+ virtual void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) = 0;
  virtual size_t get_size_written() = 0;
  virtual ~llama_data_write() = default;
 
@@ -17465,9 +17635,8 @@ struct llama_data_write {
  // Read each range of cells of k_size length each into tmp_buf and write out
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * k_size_row);
- lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * k_size_row;
+ write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
  }
  }
 
@@ -17486,9 +17655,8 @@ struct llama_data_write {
  // Read each range of cells of v_size length each into tmp_buf and write out
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * v_size_row);
- lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * v_size_row;
+ write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
  }
  }
  } else {
@@ -17514,9 +17682,8 @@ struct llama_data_write {
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
  const size_t src_offset = (range.first + j * kv_size) * v_size_el;
- tmp_buf.resize(range_size * v_size_el);
- lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
- write(tmp_buf.data(), tmp_buf.size());
+ const size_t buf_size = range_size * v_size_el;
+ write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
  }
  }
  }
@@ -17875,12 +18042,14 @@ struct llama_data_write_dummy : llama_data_write {
 
  llama_data_write_dummy() {}
 
- // TODO: avoid unnecessary calls to lm_ggml_backend_tensor_get in a dummy context
-
  void write(const void * /* src */, size_t size) override {
  size_written += size;
  }
 
+ void write_tensor_data(const struct lm_ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+ size_written += size;
+ }
+
  size_t get_size_written() override {
  return size_written;
  }
@@ -17903,6 +18072,16 @@ struct llama_data_write_buffer : llama_data_write {
  buf_size -= size;
  }
 
+ void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ lm_ggml_backend_tensor_get(tensor, ptr, offset, size);
+ ptr += size;
+ size_written += size;
+ buf_size -= size;
+ }
+
  size_t get_size_written() override {
  return size_written;
  }
@@ -17938,6 +18117,7 @@ struct llama_data_read_buffer : llama_data_read {
  struct llama_data_write_file : llama_data_write {
  llama_file * file;
  size_t size_written = 0;
+ std::vector<uint8_t> temp_buffer;
 
  llama_data_write_file(llama_file * f) : file(f) {}
 
@@ -17946,6 +18126,12 @@ struct llama_data_write_file : llama_data_write {
  size_written += size;
  }
 
+ void write_tensor_data(const struct lm_ggml_tensor * tensor, size_t offset, size_t size) override {
+ temp_buffer.resize(size);
+ lm_ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+ write(temp_buffer.data(), temp_buffer.size());
+ }
+
  size_t get_size_written() override {
  return size_written;
  }
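The overrides added in the hunks above move the `lm_ggml_backend_tensor_get()` call behind a `write_tensor_data()` hook, so the dummy writer used for size estimation counts bytes without ever touching backend memory, and only the file writer stages data through a temporary buffer. A minimal sketch of that pattern with illustrative names:

```cpp
#include <cstddef>

struct tensor_sink {
    virtual void write_tensor_data(const void * tensor, size_t offset, size_t size) = 0;
    virtual ~tensor_sink() = default;
};

// size-only pass: never reads tensor memory
struct sizing_sink : tensor_sink {
    size_t total = 0;
    void write_tensor_data(const void *, size_t, size_t size) override {
        total += size;
    }
};
```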
@@ -18530,11 +18716,11 @@ llama_token llama_token_pad(const struct llama_model * model) {
  return llama_token_pad_impl(model->vocab);
  }
 
- int32_t llama_add_bos_token(const struct llama_model * model) {
+ bool llama_add_bos_token(const struct llama_model * model) {
  return llama_add_bos_token_impl(model->vocab);
  }
 
- int32_t llama_add_eos_token(const struct llama_model * model) {
+ bool llama_add_eos_token(const struct llama_model * model) {
  return llama_add_eos_token_impl(model->vocab);
  }
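Since `llama_add_bos_token()` and `llama_add_eos_token()` now return `bool`, the result can be fed straight into tokenization instead of being compared against the old `int32_t` convention where -1 meant "unknown". A hedged usage sketch, assuming the `llama_tokenize()` signature of this llama.cpp revision:

```cpp
#include "llama.h"

static int32_t tokenize_prompt(const llama_model * model, const char * text, int32_t text_len,
                               llama_token * tokens, int32_t n_tokens_max) {
    const bool add_bos = llama_add_bos_token(model);  // previously int32_t, with -1 = unknown
    return llama_tokenize(model, text, text_len, tokens, n_tokens_max,
                          /*add_special=*/add_bos, /*parse_special=*/true);
}
```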