@fugood/llama.node 1.2.0-rc.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1542,6 +1542,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.dec_start_token_id = dec_start_token_id;
                 }
 
+                hparams.dec_n_layer = hparams.n_layer;
+                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
                 switch (hparams.n_layer) {
                     case 6: type = LLM_TYPE_60M; break; // t5-small
                     case 8: type = LLM_TYPE_80M; break; // flan-t5-small
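
The two added lines decouple the T5 decoder depth from the encoder depth: `dec_n_layer` defaults to `n_layer`, and the optional `ml.get_key(..., false)` overrides it only when the GGUF metadata actually carries a decoder block count (the trailing `false` marks the key as not required). A minimal standalone sketch of that fallback semantics; the `get_key_optional` helper is hypothetical, standing in for the loader:

```cpp
#include <cstdint>
#include <optional>

// stand-in for llama_model_loader::get_key(key, dst, /*required=*/false):
// copy the metadata value if present, otherwise leave dst untouched
static bool get_key_optional(const std::optional<uint32_t> & kv, uint32_t & dst) {
    if (!kv.has_value()) {
        return false;
    }
    dst = *kv;
    return true;
}

int main() {
    uint32_t n_layer     = 12;      // encoder blocks, always present
    uint32_t dec_n_layer = n_layer; // default: symmetric encoder/decoder

    // symmetric checkpoint: no decoder block count in the metadata
    get_key_optional(std::nullopt, dec_n_layer);            // dec_n_layer stays 12

    // asymmetric checkpoint, e.g. a deep-encoder/shallow-decoder T5 variant
    get_key_optional(std::optional<uint32_t>(3), dec_n_layer); // dec_n_layer becomes 3
    return 0;
}
```
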
@@ -4414,6 +4417,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }
 
+                    // n_layer:     number of encoder_layers
+                    // dec_n_layer: number of decoder_layers
+                    const int dec_n_layer = hparams.dec_n_layer;
+                    if (dec_n_layer > n_layer) {
+                        layers.resize(dec_n_layer);
+                    }
+
+                    // load encoder layers
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
@@ -4429,6 +4440,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                         layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                         layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                    }
+
+                    // load decoder layers
+                    for (int i = 0; i < dec_n_layer; ++i) {
+                        auto & layer = layers[i];
 
                         layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                         layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
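
Together these two `load_tensors` hunks split what used to be one combined loop: if the decoder is deeper than the encoder, `layers` is first grown to `dec_n_layer` entries; the encoder (`ENC_*`) tensors are then loaded for indices `0..n_layer-1` and the decoder (`DEC_*`) tensors for indices `0..dec_n_layer-1`, with both loops filling different fields of the same per-layer structs.
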
@@ -6927,9 +6943,7 @@ struct llm_build_falcon : public llm_graph_context {
 
                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
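
This hunk is the first of many in this release with the same shape: the V slice of the fused QKV projection used to be taken as a 2-D view and then materialized with `ggml_cont_3d`, which adds a copy node to the graph; it is now exposed directly as a strided 3-D view of shape `(n_embd_head, n_head_kv, n_tokens)`, presumably because the attention path can now consume non-contiguous Q/K/V. A side-by-side sketch of the two forms, assuming the public ggml headers:

```cpp
#include "ggml.h"

// old form: 2-D slice of the fused QKV rows, then an explicit contiguous
// copy reshaped to (head_dim, kv_heads, tokens) - one extra graph node
ggml_tensor * v_as_copy(ggml_context * ctx0, ggml_tensor * cur,
                        int64_t n_embd, int64_t n_embd_gqa,
                        int64_t n_embd_head, int64_t n_head_kv, int64_t n_tokens) {
    ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens,
                                      cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
    return ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
}

// new form: the same elements exposed directly as a strided 3-D view, no copy
ggml_tensor * v_as_view(ggml_context * ctx0, ggml_tensor * cur,
                        int64_t n_embd, int64_t n_embd_gqa,
                        int64_t n_embd_head, int64_t n_head_kv, int64_t n_tokens) {
    return ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens,
                        n_embd_head*sizeof(float), cur->nb[1],
                        sizeof(float)*(n_embd + n_embd_gqa));
}
```
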
@@ -7207,9 +7221,7 @@ struct llm_build_dbrx : public llm_graph_context {
 
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -7329,13 +7341,9 @@ struct llm_build_starcoder : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
-                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -7551,14 +7559,16 @@ struct llm_build_bert : public llm_graph_context {
                     cb(cur, "bqkv", il);
                 }
 
-                Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
-                Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }
 
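The BERT builder gets the same strided-view treatment in its fused-QKV branch, plus immediate 3-D reshapes of Q and K in the separate-projection branch. Those early reshapes are what allow the next two hunks to drop the `else { ... ggml_cont_3d(...) }` fallbacks after the optional Q/K norms: by that point Q and K are already 3-D in both branches.
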
@@ -7569,8 +7579,6 @@ struct llm_build_bert : public llm_graph_context {
                         LLM_NORM, il);
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            } else {
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             }
 
             if (model.layers[il].attn_k_norm) {
@@ -7580,8 +7588,6 @@ struct llm_build_bert : public llm_graph_context {
                         LLM_NORM, il);
 
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            } else {
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             }
 
             // RoPE
@@ -7727,9 +7733,7 @@ struct llm_build_neo_bert : public llm_graph_context {
 
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 // RoPE
                 Qcur = ggml_rope_ext(
@@ -7836,13 +7840,9 @@ struct llm_build_bloom : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
-                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -7958,13 +7958,9 @@ struct llm_build_mpt : public llm_graph_context {
                     cb(cur, "wqkv_clamped", il);
                 }
 
-                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
-                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 // Q/K Layernorm
                 if (model.layers[il].attn_q_norm) {
@@ -7972,26 +7968,16 @@ struct llm_build_mpt : public llm_graph_context {
                             model.layers[il].attn_q_norm,
                             model.layers[il].attn_q_norm_b,
                             LLM_NORM, il);
-                    cb(Qcur, "Qcur", il);
 
                     Kcur = build_norm(Kcur,
                             model.layers[il].attn_k_norm,
                             model.layers[il].attn_k_norm_b,
                             LLM_NORM, il);
-                    cb(Kcur, "Kcur", il);
 
                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                } else {
-                    Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                    cb(Qcur, "Qcur", il);
-
-                    Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                    cb(Kcur, "Kcur", il);
                 }
 
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
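
In the MPT builder the refactor also tidies the Q/K-norm block: the duplicated per-branch `cb(...)` calls and the entire `else` arm of `ggml_cont_3d` fallbacks are removed, leaving a single set of `cb(Qcur/Kcur/Vcur, ...)` callbacks after the conditional.
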
@@ -8240,11 +8226,9 @@ struct llm_build_qwen : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));
 
                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
@@ -9219,21 +9203,17 @@ struct llm_build_phi2 : public llm_graph_context {
 
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }
 
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9357,21 +9337,17 @@ struct llm_build_phi3 : public llm_graph_context {
 
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
-                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }
 
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, rope_factors,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9621,18 +9597,14 @@ struct llm_build_gpt2 : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
-                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
@@ -9727,9 +9699,7 @@ struct llm_build_codeshell : public llm_graph_context {
 
                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -12601,9 +12571,7 @@ struct llm_build_gptneox : public llm_graph_context {
 
                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -13557,7 +13525,9 @@ struct llm_build_t5_dec : public llm_graph_context {
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-        for (int il = 0; il < n_layer; ++il) {
+        const int64_t dec_n_layer = hparams.dec_n_layer;
+
+        for (int il = 0; il < dec_n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
             // norm
@@ -13648,7 +13618,7 @@ struct llm_build_t5_dec : public llm_graph_context {
                 //cb(cur, "kqv_out", il);
             }
 
-            if (il == n_layer - 1 && inp_out_ids) {
+            if (il == dec_n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
             }
@@ -13669,8 +13639,8 @@ struct llm_build_t5_dec : public llm_graph_context {
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_PAR  : LLM_FFN_SEQ,
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+                    model.layers[il].ffn_gate ? LLM_FFN_PAR  : LLM_FFN_SEQ,
                     il);
             cb(cur, "ffn_out", il);
         }
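
Three coordinated fixes in the T5 decoder graph: the layer loop and the last-layer output-row check now iterate over `dec_n_layer` rather than `n_layer`, and the decoder FFN picks GELU/parallel vs ReLU/sequential from its own `ffn_gate` tensor instead of the encoder's `ffn_gate_enc`; with unequal layer counts, a decoder layer deeper than the encoder has no corresponding `ffn_gate_enc` to test.
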
@@ -13736,18 +13706,14 @@ struct llm_build_jais : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
-                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
-                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
@@ -13859,8 +13825,7 @@ struct llm_build_chatglm : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             }
 
             //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -13993,8 +13958,7 @@ struct llm_build_glm4 : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             }
 
             Qcur = ggml_rope_ext(
@@ -17293,16 +17257,14 @@ private:
         const int64_t k_offset = n_embd_head_q * n_head;
         const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
 
-        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
+        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,    n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-        ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));
+        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
 
         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);
 
-        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
-
         Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
         cb(Qcur, "Qcur_normed", il);
 
@@ -920,7 +920,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
             const int64_t nelements = ggml_nelements(tensor);
 
@@ -1037,8 +1037,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
     close_ofstream();
 
-    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size  = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size  = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
 
     if (qs.n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
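
A small but genuine fix in both quantization hunks: these sizes were always computed with 1024-based divisors, so labeling them MiB matches the arithmetic and the MiB already used elsewhere in the logging.
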
@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 }
 
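`llama_supports_gpu_offload()` now also returns true when the only accelerator present is an integrated GPU (`GGML_BACKEND_DEVICE_TYPE_IGPU`), the device type handled by the selection changes further down.
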
@@ -83,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-        numa_init_fn(numa);
+        if (numa_init_fn) {
+            numa_init_fn(numa);
+        }
     }
 }
 
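`ggml_backend_reg_get_proc_address` can return null when the loaded CPU backend does not export the requested symbol; the new guard makes NUMA initialization a no-op in that case instead of calling through a null pointer.
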
@@ -182,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-        // use all available devices
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
@@ -192,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;
 
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                    __func__,
+                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                    props.device_id ? props.device_id : "unknown id",
+                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
-            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }
 
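Default device selection is rebuilt around three buckets: RPC servers stay at the front of the list (the comment now explains why: to minimize network transfers), discrete GPUs are de-duplicated by `device_id` so one physical card exposed by two backends is only used once, and integrated GPUs are used only when nothing else was found. The removed `if (!rpc_servers.empty())` guard was redundant, since inserting an empty range is a no-op. A self-contained sketch of the de-duplication rule; the `dev_info` type is hypothetical, standing in for `ggml_backend_dev_props`:

```cpp
#include <cstring>
#include <vector>

struct dev_info {
    const char * name;
    const char * device_id; // may be nullptr when the backend reports no id
};

// keep the first device seen for each non-null device_id; devices without an
// id are never considered duplicates of anything
static std::vector<dev_info> dedup_by_id(const std::vector<dev_info> & all) {
    std::vector<dev_info> gpus;
    for (const auto & dev : all) {
        bool dup = false;
        for (const auto & seen : gpus) {
            if (dev.device_id && seen.device_id &&
                std::strcmp(dev.device_id, seen.device_id) == 0) {
                dup = true; // same physical device already selected
                break;
            }
        }
        if (!dup) {
            gpus.push_back(dev);
        }
    }
    return gpus;
}
```
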
@@ -225,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                props.device_id ? props.device_id : "unknown id",
+                props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);