@fugood/llama.node 1.1.11 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +250 -1
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  39. package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
  40. package/src/llama.cpp/ggml/include/ggml.h +56 -2
  41. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  43. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  53. package/src/llama.cpp/include/llama.h +5 -6
  54. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  55. package/src/llama.cpp/src/llama-adapter.h +3 -0
  56. package/src/llama.cpp/src/llama-arch.cpp +28 -4
  57. package/src/llama.cpp/src/llama-arch.h +3 -0
  58. package/src/llama.cpp/src/llama-context.cpp +65 -57
  59. package/src/llama.cpp/src/llama-context.h +1 -1
  60. package/src/llama.cpp/src/llama-graph.cpp +57 -11
  61. package/src/llama.cpp/src/llama-graph.h +8 -0
  62. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  63. package/src/llama.cpp/src/llama-hparams.h +10 -3
  64. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
  65. package/src/llama.cpp/src/llama-kv-cache.h +9 -0
  66. package/src/llama.cpp/src/llama-model.cpp +217 -97
  67. package/src/llama.cpp/src/llama-model.h +0 -1
  68. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  69. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  70. package/src/llama.cpp/src/llama.cpp +53 -10
  71. package/src/anyascii.c +0 -22223
  72. package/src/anyascii.h +0 -42
  73. package/src/tts_utils.cpp +0 -371
  74. package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama-model.cpp
@@ -1110,7 +1110,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

  switch (hparams.n_layer) {
- case 18: type = LLM_TYPE_537M; break;
+ case 18: type = LLM_TYPE_270M; break;
  case 26: type = LLM_TYPE_1B; break;
  case 34: type = LLM_TYPE_4B; break;
  case 48: type = LLM_TYPE_12B; break;
@@ -1142,6 +1142,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+ hparams.set_swa_pattern(6);
+
+ hparams.causal_attn = false; // embeddings do not use causal attention
+ hparams.rope_freq_base_train_swa = 10000.0f;
+ hparams.rope_freq_scale_train_swa = 1.0f;
+
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_0_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+
+ } break;
  case LLM_ARCH_STARCODER2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
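Note on the hunk above: the new GEMMA_EMBEDDING block pre-computes the attention scale as 1/sqrt(n_embd_head_k), and the graph builder added later in this diff pre-scales Q by hparams.f_attention_scale while passing 1.0f as the KQ scale to build_attn. A quick numeric sanity check of the formula, assuming a purely illustrative head size of 256 (not taken from this diff):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // hypothetical head size, only to illustrate the formula above
    const float n_embd_head_k = 256.0f;
    const float f_attention_scale = 1.0f / std::sqrt(n_embd_head_k);
    std::printf("f_attention_scale = %f\n", f_attention_scale); // 0.0625
    return 0;
}
```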
@@ -1522,6 +1542,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.dec_start_token_id = dec_start_token_id;
  }

+ hparams.dec_n_layer = hparams.n_layer;
+ ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
  switch (hparams.n_layer) {
  case 6: type = LLM_TYPE_60M; break; // t5-small
  case 8: type = LLM_TYPE_80M; break; // flan-t5-small
@@ -3484,6 +3507,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_GEMMA3:
+ case LLM_ARCH_GEMMA_EMBEDDING:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -4393,6 +4417,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ // n_layer: number of encoder_layers
+ // dec_n_layer: number of decoder_layers
+ const int dec_n_layer = hparams.dec_n_layer;
+ if (dec_n_layer > n_layer) {
+ layers.resize(dec_n_layer);
+ }
+
+ // load encoder layers
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -4408,6 +4440,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+
+ // load decoder layers
+ for (int i = 0; i < dec_n_layer; ++i) {
+ auto & layer = layers[i];

  layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
@@ -6906,9 +6943,7 @@ struct llm_build_falcon : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
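The hunk above shows a pattern repeated across most of the graph-builder hunks in this file (falcon, dbrx, starcoder, bert, bloom, mpt, qwen, phi2/phi3, gpt2, codeshell, gptneox, jais, chatglm, glm4): the V slice of the fused QKV projection used to be taken with ggml_view_2d and then copied into head-major shape with ggml_cont_3d; it is now taken directly as a strided ggml_view_3d, dropping the copy node from the graph. A minimal, self-contained sketch of the two forms (tensor sizes are made up; this is not code from the package):

```cpp
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    const int64_t n_embd = 64, n_head = 8, n_head_kv = 8, n_tokens = 4;
    const int64_t n_embd_head = n_embd / n_head;
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv;

    // stand-in for the fused wqkv output: one row of Q|K|V floats per token
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd + 2*n_embd_gqa, n_tokens);

    // old pattern: 2-D slice of V followed by an explicit contiguous 3-D copy
    struct ggml_tensor * V_old = ggml_view_2d(ctx, cur, n_embd_gqa, n_tokens,
                                              cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
    V_old = ggml_cont_3d(ctx, V_old, n_embd_head, n_head_kv, n_tokens);

    // new pattern: strided 3-D view over the same bytes, no copy in the graph
    struct ggml_tensor * V_new = ggml_view_3d(ctx, cur, n_embd_head, n_head_kv, n_tokens,
                                              n_embd_head*sizeof(float), cur->nb[1],
                                              sizeof(float)*(n_embd + n_embd_gqa));
    (void) V_old; (void) V_new;
    ggml_free(ctx);
    return 0;
}
```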
@@ -7186,9 +7221,7 @@ struct llm_build_dbrx : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -7308,13 +7341,9 @@ struct llm_build_starcoder : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7530,14 +7559,16 @@ struct llm_build_bert : public llm_graph_context {
  cb(cur, "bqkv", il);
  }

- Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

@@ -7548,8 +7579,6 @@ struct llm_build_bert : public llm_graph_context {
  LLM_NORM, il);

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- } else {
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  }

  if (model.layers[il].attn_k_norm) {
@@ -7559,8 +7588,6 @@ struct llm_build_bert : public llm_graph_context {
  LLM_NORM, il);

  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- } else {
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  }

  // RoPE
@@ -7706,9 +7733,7 @@ struct llm_build_neo_bert : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  // RoPE
  Qcur = ggml_rope_ext(
@@ -7815,13 +7840,9 @@ struct llm_build_bloom : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7937,13 +7958,9 @@ struct llm_build_mpt : public llm_graph_context {
  cb(cur, "wqkv_clamped", il);
  }

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  // Q/K Layernorm
  if (model.layers[il].attn_q_norm) {
@@ -7951,26 +7968,16 @@ struct llm_build_mpt : public llm_graph_context {
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
  LLM_NORM, il);
- cb(Qcur, "Qcur", il);

  Kcur = build_norm(Kcur,
  model.layers[il].attn_k_norm,
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
- cb(Kcur, "Kcur", il);

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- } else {
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- cb(Qcur, "Qcur", il);
-
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- cb(Kcur, "Kcur", il);
  }

- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
@@ -8219,11 +8226,9 @@ struct llm_build_qwen : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
@@ -9198,21 +9203,17 @@ struct llm_build_phi2 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9336,21 +9337,17 @@ struct llm_build_phi3 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9600,18 +9597,14 @@ struct llm_build_gpt2 : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
@@ -9706,9 +9699,7 @@ struct llm_build_codeshell : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -11045,6 +11036,137 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  }
  };

+ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
+ llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+ if (ubatch.token) {
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+ }
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
+ auto * inp_attn = build_attn_inp_kv_iswa();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  // TODO: move up next to build_starcoder
  struct llm_build_starcoder2 : public llm_graph_context {
  llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
@@ -12449,9 +12571,7 @@ struct llm_build_gptneox : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -13405,7 +13525,9 @@ struct llm_build_t5_dec : public llm_graph_context {

  ggml_tensor * inp_out_ids = build_inp_out_ids();

- for (int il = 0; il < n_layer; ++il) {
+ const int64_t dec_n_layer = hparams.dec_n_layer;
+
+ for (int il = 0; il < dec_n_layer; ++il) {
  ggml_tensor * inpSA = inpL;

  // norm
@@ -13496,7 +13618,7 @@ struct llm_build_t5_dec : public llm_graph_context {
  //cb(cur, "kqv_out", il);
  }

- if (il == n_layer - 1 && inp_out_ids) {
+ if (il == dec_n_layer - 1 && inp_out_ids) {
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  }
@@ -13517,8 +13639,8 @@ struct llm_build_t5_dec : public llm_graph_context {
  model.layers[il].ffn_gate, NULL, NULL,
  model.layers[il].ffn_down, NULL, NULL,
  NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
  il);
  cb(cur, "ffn_out", il);
  }
@@ -13584,18 +13706,14 @@ struct llm_build_jais : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
@@ -13707,8 +13825,7 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  }

  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -13841,8 +13958,7 @@ struct llm_build_glm4 : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  }

  Qcur = ggml_rope_ext(
@@ -17141,16 +17257,14 @@ private:
  const int64_t k_offset = n_embd_head_q * n_head;
  const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;

- ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
-
  Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  cb(Qcur, "Qcur_normed", il);

@@ -18481,6 +18595,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_NEO_BERT:
  case LLM_ARCH_WAVTOKENIZER_DEC:
+ //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
  {
@@ -18761,6 +18876,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
  } break;
+ case LLM_ARCH_GEMMA_EMBEDDING:
+ {
+ llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+ } break;
  case LLM_ARCH_STARCODER2:
  {
  llm = std::make_unique<llm_build_starcoder2>(*this, params);
@@ -19161,6 +19280,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_GEMMA2:
  case LLM_ARCH_GEMMA3:
  case LLM_ARCH_GEMMA3N:
+ case LLM_ARCH_GEMMA_EMBEDDING:
  case LLM_ARCH_STARCODER2:
  case LLM_ARCH_OPENELM:
  case LLM_ARCH_GPTNEOX:
package/src/llama.cpp/src/llama-model.h
@@ -39,7 +39,6 @@ enum llm_type {
  LLM_TYPE_410M,
  LLM_TYPE_450M,
  LLM_TYPE_475M,
- LLM_TYPE_537M,
  LLM_TYPE_558M,
  LLM_TYPE_700M,
  LLM_TYPE_770M,
package/src/llama.cpp/src/llama-quant.cpp
@@ -920,7 +920,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  new_type = tensor->type;
  new_data = tensor->data;
  new_size = ggml_nbytes(tensor);
- LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
  } else {
  const int64_t nelements = ggml_nelements(tensor);

@@ -1037,8 +1037,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  }
  close_ofstream();

- LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
- LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

  if (qs.n_fallback > 0) {
  LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",