llama_cpp 0.12.6 → 0.12.7 — changes to the llama.cpp C++ source

@@ -197,6 +197,7 @@ enum llm_arch {
197
197
  LLM_ARCH_PERSIMMON,
198
198
  LLM_ARCH_REFACT,
199
199
  LLM_ARCH_BERT,
200
+ LLM_ARCH_NOMIC_BERT,
200
201
  LLM_ARCH_BLOOM,
201
202
  LLM_ARCH_STABLELM,
202
203
  LLM_ARCH_QWEN,
@@ -207,31 +208,34 @@ enum llm_arch {
207
208
  LLM_ARCH_ORION,
208
209
  LLM_ARCH_INTERNLM2,
209
210
  LLM_ARCH_MINICPM,
211
+ LLM_ARCH_GEMMA,
210
212
  LLM_ARCH_UNKNOWN,
211
213
  };
212
214
 
213
215
  static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
214
- { LLM_ARCH_LLAMA, "llama" },
215
- { LLM_ARCH_FALCON, "falcon" },
216
- { LLM_ARCH_GPT2, "gpt2" },
217
- { LLM_ARCH_GPTJ, "gptj" },
218
- { LLM_ARCH_GPTNEOX, "gptneox" },
219
- { LLM_ARCH_MPT, "mpt" },
220
- { LLM_ARCH_BAICHUAN, "baichuan" },
221
- { LLM_ARCH_STARCODER, "starcoder" },
222
- { LLM_ARCH_PERSIMMON, "persimmon" },
223
- { LLM_ARCH_REFACT, "refact" },
224
- { LLM_ARCH_BERT, "bert" },
225
- { LLM_ARCH_BLOOM, "bloom" },
226
- { LLM_ARCH_STABLELM, "stablelm" },
227
- { LLM_ARCH_QWEN, "qwen" },
228
- { LLM_ARCH_QWEN2, "qwen2" },
229
- { LLM_ARCH_PHI2, "phi2" },
230
- { LLM_ARCH_PLAMO, "plamo" },
231
- { LLM_ARCH_CODESHELL, "codeshell" },
232
- { LLM_ARCH_ORION, "orion" },
233
- { LLM_ARCH_INTERNLM2, "internlm2" },
234
- { LLM_ARCH_MINICPM, "minicpm" },
216
+ { LLM_ARCH_LLAMA, "llama" },
217
+ { LLM_ARCH_FALCON, "falcon" },
218
+ { LLM_ARCH_GPT2, "gpt2" },
219
+ { LLM_ARCH_GPTJ, "gptj" },
220
+ { LLM_ARCH_GPTNEOX, "gptneox" },
221
+ { LLM_ARCH_MPT, "mpt" },
222
+ { LLM_ARCH_BAICHUAN, "baichuan" },
223
+ { LLM_ARCH_STARCODER, "starcoder" },
224
+ { LLM_ARCH_PERSIMMON, "persimmon" },
225
+ { LLM_ARCH_REFACT, "refact" },
226
+ { LLM_ARCH_BERT, "bert" },
227
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
228
+ { LLM_ARCH_BLOOM, "bloom" },
229
+ { LLM_ARCH_STABLELM, "stablelm" },
230
+ { LLM_ARCH_QWEN, "qwen" },
231
+ { LLM_ARCH_QWEN2, "qwen2" },
232
+ { LLM_ARCH_PHI2, "phi2" },
233
+ { LLM_ARCH_PLAMO, "plamo" },
234
+ { LLM_ARCH_CODESHELL, "codeshell" },
235
+ { LLM_ARCH_ORION, "orion" },
236
+ { LLM_ARCH_INTERNLM2, "internlm2" },
237
+ { LLM_ARCH_MINICPM, "minicpm" },
238
+ { LLM_ARCH_GEMMA, "gemma" },
235
239
  };
236
240
 
237
241
  enum llm_kv {
@@ -254,7 +258,7 @@ enum llm_kv {
254
258
  LLM_KV_TENSOR_DATA_LAYOUT,
255
259
  LLM_KV_EXPERT_COUNT,
256
260
  LLM_KV_EXPERT_USED_COUNT,
257
- LLM_KV_POOLING_LAYER,
261
+ LLM_KV_POOLING_TYPE,
258
262
 
259
263
  LLM_KV_ATTENTION_HEAD_COUNT,
260
264
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -312,7 +316,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
312
316
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
313
317
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
314
318
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
315
- { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
319
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
316
320
 
317
321
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
318
322
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -375,6 +379,7 @@ enum llm_tensor {
375
379
  LLM_TENSOR_ATTN_OUT,
376
380
  LLM_TENSOR_ATTN_NORM,
377
381
  LLM_TENSOR_ATTN_NORM_2,
382
+ LLM_TENSOR_ATTN_OUT_NORM,
378
383
  LLM_TENSOR_ATTN_ROT_EMBD,
379
384
  LLM_TENSOR_FFN_GATE_INP,
380
385
  LLM_TENSOR_FFN_NORM,
@@ -387,6 +392,7 @@ enum llm_tensor {
387
392
  LLM_TENSOR_FFN_UP_EXP,
388
393
  LLM_TENSOR_ATTN_Q_NORM,
389
394
  LLM_TENSOR_ATTN_K_NORM,
395
+ LLM_TENSOR_LAYER_OUT_NORM,
390
396
  };
391
397
 
392
398
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -503,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
503
509
  {
504
510
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
505
511
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
506
- { LLM_TENSOR_OUTPUT, "output" },
507
512
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
508
513
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
509
514
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -552,12 +557,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
552
557
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
553
558
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
554
559
  { LLM_TENSOR_POS_EMBD, "position_embd" },
555
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
560
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
556
561
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
557
562
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
558
563
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
559
564
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
560
- { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
565
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
566
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
567
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
568
+ },
569
+ },
570
+ {
571
+ LLM_ARCH_NOMIC_BERT,
572
+ {
573
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
574
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
575
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
576
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
577
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
578
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
579
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
580
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
561
581
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
562
582
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
563
583
  },
@@ -741,6 +761,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
741
761
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
742
762
  },
743
763
  },
764
+ {
765
+ LLM_ARCH_GEMMA,
766
+ {
767
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
768
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
769
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
770
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
771
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
772
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
773
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
774
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
775
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
776
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
777
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
778
+ },
779
+ },
744
780
  {
745
781
  LLM_ARCH_UNKNOWN,
746
782
  {
@@ -1015,7 +1051,7 @@ struct llama_mmap {
1015
1051
  int fd = fileno(file->fp);
1016
1052
  int flags = MAP_SHARED;
1017
1053
  // prefetch/readahead impairs performance on NUMA systems
1018
- if (numa) { prefetch = 0; }
1054
+ if (numa) { prefetch = 0; }
1019
1055
  #ifdef __linux__
1020
1056
  // advise the kernel to read the file sequentially (increases readahead)
1021
1057
  if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -1485,6 +1521,7 @@ enum e_model {
1485
1521
  MODEL_22M,
1486
1522
  MODEL_33M,
1487
1523
  MODEL_109M,
1524
+ MODEL_137M,
1488
1525
  MODEL_335M,
1489
1526
  MODEL_0_5B,
1490
1527
  MODEL_1B,
@@ -1537,12 +1574,13 @@ struct llama_hparams {
1537
1574
  uint32_t n_yarn_orig_ctx;
1538
1575
  int32_t rope_scaling_type_train;
1539
1576
 
1540
- float f_clamp_kqv;
1541
- float f_max_alibi_bias;
1577
+ float f_clamp_kqv = 0.0f;
1578
+ float f_max_alibi_bias = 0.0f;
1542
1579
 
1543
1580
  bool causal_attn = true;
1544
- bool pooling_layer = false;
1581
+ bool need_kq_pos = false;
1545
1582
 
1583
+ uint32_t pooling_type = LLAMA_POOLING_NONE;
1546
1584
 
1547
1585
  bool operator!=(const llama_hparams & other) const {
1548
1586
  if (this->vocab_only != other.vocab_only) return true;
@@ -1620,6 +1658,8 @@ struct llama_layer {
1620
1658
  struct ggml_tensor * attn_q_norm_b;
1621
1659
  struct ggml_tensor * attn_k_norm;
1622
1660
  struct ggml_tensor * attn_k_norm_b;
1661
+ struct ggml_tensor * attn_out_norm;
1662
+ struct ggml_tensor * attn_out_norm_b;
1623
1663
 
1624
1664
  // attention
1625
1665
  struct ggml_tensor * wq;
@@ -1638,6 +1678,8 @@ struct llama_layer {
1638
1678
  // normalization
1639
1679
  struct ggml_tensor * ffn_norm;
1640
1680
  struct ggml_tensor * ffn_norm_b;
1681
+ struct ggml_tensor * layer_out_norm;
1682
+ struct ggml_tensor * layer_out_norm_b;
1641
1683
 
1642
1684
  // ff
1643
1685
  struct ggml_tensor * ffn_gate; // w1
@@ -1899,8 +1941,10 @@ struct llama_context {
1899
1941
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1900
1942
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1901
1943
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1944
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
1902
1945
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1903
- struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
1946
+ struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
1947
+ struct ggml_tensor * inp_cls; // I32 [n_batch]
1904
1948
 
1905
1949
  #ifdef GGML_USE_MPI
1906
1950
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2499,6 +2543,8 @@ struct llama_model_loader {
2499
2543
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2500
2544
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2501
2545
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2546
+ case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
2547
+ case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2502
2548
  default:
2503
2549
  {
2504
2550
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2744,13 +2790,7 @@ struct llama_model_loader {
2744
2790
 
2745
2791
  std::vector<no_init<uint8_t>> read_buf;
2746
2792
 
2747
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2748
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2749
- if (!cur) {
2750
- // some tensors may be allocated in a different context
2751
- continue;
2752
- }
2753
-
2793
+ for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
2754
2794
  if (progress_callback) {
2755
2795
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
2756
2796
  return false;
@@ -2848,6 +2888,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2848
2888
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2849
2889
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2850
2890
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2891
+ case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
2892
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
2851
2893
 
2852
2894
  default: return "unknown, may not work";
2853
2895
  }
@@ -2855,6 +2897,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2855
2897
 
2856
2898
  static const char * llama_model_type_name(e_model type) {
2857
2899
  switch (type) {
2900
+ case MODEL_22M: return "22M";
2901
+ case MODEL_33M: return "33M";
2902
+ case MODEL_109M: return "109M";
2903
+ case MODEL_137M: return "137M";
2904
+ case MODEL_0_5B: return "0.5B";
2858
2905
  case MODEL_1B: return "1B";
2859
2906
  case MODEL_2B: return "2B";
2860
2907
  case MODEL_3B: return "3B";
@@ -3024,6 +3071,11 @@ static void llm_load_hparams(
3024
3071
  case 40: model.type = e_model::MODEL_13B; break;
3025
3072
  default: model.type = e_model::MODEL_UNKNOWN;
3026
3073
  }
3074
+
3075
+ if (model.type == e_model::MODEL_13B) {
3076
+ // TODO: become GGUF KV parameter
3077
+ hparams.f_max_alibi_bias = 8.0f;
3078
+ }
3027
3079
  } break;
3028
3080
  case LLM_ARCH_STARCODER:
3029
3081
  {
@@ -3051,13 +3103,16 @@ static void llm_load_hparams(
3051
3103
  case 32: model.type = e_model::MODEL_1B; break;
3052
3104
  default: model.type = e_model::MODEL_UNKNOWN;
3053
3105
  }
3106
+
3107
+ // TODO: become GGUF KV parameter
3108
+ hparams.f_max_alibi_bias = 8.0f;
3054
3109
  } break;
3055
3110
  case LLM_ARCH_BERT:
3056
3111
  {
3057
3112
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3058
3113
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3059
3114
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3060
- ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
3115
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3061
3116
 
3062
3117
  switch (hparams.n_layer) {
3063
3118
  case 3:
@@ -3073,6 +3128,17 @@ static void llm_load_hparams(
3073
3128
  model.type = e_model::MODEL_335M; break; // bge-large
3074
3129
  }
3075
3130
  } break;
3131
+ case LLM_ARCH_NOMIC_BERT:
3132
+ {
3133
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3134
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3135
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3136
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3137
+
3138
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
3139
+ model.type = e_model::MODEL_137M;
3140
+ }
3141
+ } break;
3076
3142
  case LLM_ARCH_BLOOM:
3077
3143
  {
3078
3144
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3085,11 +3151,12 @@ static void llm_load_hparams(
3085
3151
  case 4096: model.type = e_model::MODEL_7B; break;
3086
3152
  } break;
3087
3153
  }
3154
+
3155
+ // TODO: become GGUF KV parameter
3156
+ hparams.f_max_alibi_bias = 8.0f;
3088
3157
  } break;
3089
3158
  case LLM_ARCH_MPT:
3090
3159
  {
3091
- hparams.f_clamp_kqv = 0.0f;
3092
-
3093
3160
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3094
3161
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
3095
3162
  ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
@@ -3187,10 +3254,24 @@ static void llm_load_hparams(
3187
3254
  default: model.type = e_model::MODEL_UNKNOWN;
3188
3255
  }
3189
3256
  } break;
3257
+ case LLM_ARCH_GEMMA:
3258
+ {
3259
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3260
+
3261
+ switch (hparams.n_layer) {
3262
+ case 18: model.type = e_model::MODEL_2B; break;
3263
+ case 28: model.type = e_model::MODEL_7B; break;
3264
+ default: model.type = e_model::MODEL_UNKNOWN;
3265
+ }
3266
+ } break;
3190
3267
  default: (void)0;
3191
3268
  }
3192
3269
 
3193
3270
  model.ftype = ml.ftype;
3271
+
3272
+ if (hparams.f_max_alibi_bias > 0.0f) {
3273
+ hparams.need_kq_pos = true;
3274
+ }
3194
3275
  }
3195
3276
 
3196
3277
  // TODO: This should probably be in llama.h
@@ -3634,7 +3715,7 @@ static bool llm_load_tensors(
3634
3715
  }
3635
3716
 
3636
3717
  // create one context per buffer type
3637
- size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
3718
+ size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
3638
3719
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
3639
3720
  for (auto & it : buft_layer_count) {
3640
3721
  struct ggml_init_params params = {
@@ -3772,6 +3853,7 @@ static bool llm_load_tensors(
3772
3853
  } else {
3773
3854
  model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
3774
3855
  ml.n_created--; // artificial tensor
3856
+ ml.size_data += ggml_nbytes(model.output);
3775
3857
  }
3776
3858
  }
3777
3859
 
@@ -3875,10 +3957,14 @@ static bool llm_load_tensors(
3875
3957
  }
3876
3958
  } break;
3877
3959
  case LLM_ARCH_BERT:
3960
+ case LLM_ARCH_NOMIC_BERT:
3878
3961
  {
3879
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3880
- model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3881
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3962
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3963
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3964
+ if (model.arch == LLM_ARCH_BERT) {
3965
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3966
+ }
3967
+
3882
3968
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3883
3969
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3884
3970
 
@@ -3888,29 +3974,38 @@ static bool llm_load_tensors(
3888
3974
 
3889
3975
  auto & layer = model.layers[i];
3890
3976
 
3891
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3892
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3977
+ if (model.arch == LLM_ARCH_BERT) {
3978
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3979
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3893
3980
 
3894
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3895
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3981
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3982
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3896
3983
 
3897
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3898
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3984
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3985
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3986
+ } else {
3987
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3988
+ }
3899
3989
 
3900
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3901
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3990
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3902
3991
 
3903
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3904
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3992
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
3993
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
3905
3994
 
3906
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3907
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3995
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3996
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3908
3997
 
3909
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3910
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3998
+ if (model.arch == LLM_ARCH_BERT) {
3999
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4000
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3911
4001
 
3912
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3913
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4002
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4003
+ } else {
4004
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4005
+ }
4006
+
4007
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4008
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
3914
4009
  }
3915
4010
  } break;
3916
4011
  case LLM_ARCH_BLOOM:
@@ -3958,7 +4053,12 @@ static bool llm_load_tensors(
3958
4053
  // output
3959
4054
  {
3960
4055
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3961
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4056
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
4057
+
4058
+ // same as tok_embd, duplicated to allow offloading
4059
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4060
+ ml.n_created--; // artificial tensor
4061
+ ml.size_data += ggml_nbytes(model.output);
3962
4062
  }
3963
4063
 
3964
4064
  for (int i = 0; i < n_layer; ++i) {
@@ -3967,14 +4067,23 @@ static bool llm_load_tensors(
3967
4067
 
3968
4068
  auto & layer = model.layers[i];
3969
4069
 
3970
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4070
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4071
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
3971
4072
 
3972
4073
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4074
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
4075
+
3973
4076
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4077
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
3974
4078
 
3975
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3976
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3977
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4079
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4080
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
4081
+
4082
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
4083
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
4084
+
4085
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4086
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
3978
4087
 
3979
4088
  // AWQ ScaleActivation layer
3980
4089
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4287,6 +4396,40 @@ static bool llm_load_tensors(
4287
4396
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4288
4397
  }
4289
4398
  } break;
4399
+ case LLM_ARCH_GEMMA:
4400
+ {
4401
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4402
+
4403
+ // output
4404
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4405
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
4406
+ ml.n_created--; // artificial tensor
4407
+ ml.size_data += ggml_nbytes(model.output);
4408
+
4409
+ const int64_t n_ff = hparams.n_ff;
4410
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
4411
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4412
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4413
+
4414
+ for (uint32_t i = 0; i < n_layer; ++i) {
4415
+ ggml_context * ctx_layer = ctx_for_layer(i);
4416
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4417
+
4418
+ auto & layer = model.layers[i];
4419
+
4420
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4421
+
4422
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
4423
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
4424
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
4425
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
4426
+
4427
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4428
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4429
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4430
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4431
+ }
4432
+ } break;
4290
4433
  default:
4291
4434
  throw std::runtime_error("unknown architecture");
4292
4435
  }
@@ -4720,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
4720
4863
  struct ggml_tensor * wo_b,
4721
4864
  struct ggml_tensor * q_cur,
4722
4865
  struct ggml_tensor * kq_mask,
4866
+ struct ggml_tensor * kq_pos,
4723
4867
  int64_t n_ctx,
4724
4868
  int32_t n_tokens,
4725
4869
  int32_t n_kv,
4726
- float max_alibi_bias,
4727
4870
  float kq_scale,
4728
4871
  const llm_build_cb & cb,
4729
4872
  int il) {
@@ -4753,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
4753
4896
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4754
4897
  }
4755
4898
 
4756
- if (max_alibi_bias > 0.0f) {
4757
- // temporary branch until we figure out how to handle ggml_alibi through ggml_add
4899
+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
4900
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
4901
+ #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4902
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4903
+ if (hparams.f_max_alibi_bias > 0.0f) {
4758
4904
  kq = ggml_scale(ctx, kq, kq_scale);
4759
4905
  cb(kq, "kq_scaled", il);
4760
4906
 
4761
- if (max_alibi_bias > 0.0f) {
4762
- // TODO: n_head or n_head_kv
4763
- // TODO: K-shift is likely not working
4764
- // TODO: change to ggml_add
4765
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
4766
- cb(kq, "kq_scaled_alibi", il);
4767
- }
4907
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
4908
+ cb(kq, "kq_scaled_alibi", il);
4768
4909
 
4769
4910
  kq = ggml_add(ctx, kq, kq_mask);
4770
4911
  cb(kq, "kq_masked", il);
4771
4912
 
4772
4913
  kq = ggml_soft_max(ctx, kq);
4773
4914
  cb(kq, "kq_soft_max", il);
4774
- } else {
4775
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
4915
+ } else
4916
+ #endif
4917
+ {
4918
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
4776
4919
  cb(kq, "kq_soft_max_ext", il);
4777
4920
  }
4778
4921
 
@@ -4820,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
4820
4963
  struct ggml_tensor * v_cur,
4821
4964
  struct ggml_tensor * q_cur,
4822
4965
  struct ggml_tensor * kq_mask,
4966
+ struct ggml_tensor * kq_pos,
4823
4967
  int64_t n_ctx,
4824
4968
  int32_t n_tokens,
4825
4969
  int32_t kv_head,
4826
4970
  int32_t n_kv,
4827
- float max_alibi_bias,
4828
4971
  float kq_scale,
4829
4972
  const llm_build_cb & cb,
4830
4973
  int il) {
@@ -4838,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
4838
4981
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4839
4982
 
4840
4983
  struct ggml_tensor * cur;
4841
- cur = llm_build_kqv(ctx, model, hparams, kv, graph,
4842
- wo, wo_b,
4843
- q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
4984
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4985
+ q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
4844
4986
  cb(cur, "kqv_out", il);
4845
4987
 
4846
4988
  return cur;
@@ -4881,7 +5023,7 @@ struct llm_build_context {
4881
5023
  const int32_t n_orig_ctx;
4882
5024
 
4883
5025
  const bool do_rope_shift;
4884
- const bool do_pooling;
5026
+ const uint32_t pooling_type;
4885
5027
 
4886
5028
  const llm_build_cb & cb;
4887
5029
 
@@ -4925,7 +5067,7 @@ struct llm_build_context {
4925
5067
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4926
5068
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4927
5069
  do_rope_shift (worst_case || kv_self.has_shift),
4928
- do_pooling (hparams.pooling_layer && cparams.do_pooling),
5070
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
4929
5071
  cb (cb),
4930
5072
  buf_compute_meta (lctx.buf_compute_meta) {
4931
5073
  // all initializations should be done in init()
@@ -5008,7 +5150,7 @@ struct llm_build_context {
5008
5150
  }
5009
5151
 
5010
5152
  Qcur = ggml_rope_custom(
5011
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5153
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5012
5154
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5013
5155
  ext_factor, attn_factor, beta_fast, beta_slow
5014
5156
  );
@@ -5023,7 +5165,7 @@ struct llm_build_context {
5023
5165
 
5024
5166
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5025
5167
  model.layers[il].wo, model.layers[il].bo,
5026
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5168
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5027
5169
  cb(cur, "kqv_out", il);
5028
5170
  }
5029
5171
 
@@ -5153,6 +5295,10 @@ struct llm_build_context {
5153
5295
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5154
5296
  cb(KQ_mask, "KQ_mask", -1);
5155
5297
 
5298
+ // positions of the tokens in the KV cache
5299
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5300
+ cb(KQ_pos, "KQ_pos", -1);
5301
+
5156
5302
  // shift the entire K-cache if needed
5157
5303
  if (do_rope_shift) {
5158
5304
  llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
@@ -5201,12 +5347,9 @@ struct llm_build_context {
5201
5347
  cb(Kcur, "Kcur", il);
5202
5348
 
5203
5349
 
5204
- // apply ALiBi for 13B model
5205
- const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
5206
-
5207
5350
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5208
5351
  model.layers[il].wo, NULL,
5209
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5352
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5210
5353
  cb(cur, "kqv_out", il);
5211
5354
  }
5212
5355
 
@@ -5330,7 +5473,7 @@ struct llm_build_context {
5330
5473
 
5331
5474
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5332
5475
  model.layers[il].wo, NULL,
5333
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5476
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5334
5477
  cb(cur, "kqv_out", il);
5335
5478
  }
5336
5479
 
@@ -5429,7 +5572,7 @@ struct llm_build_context {
5429
5572
 
5430
5573
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5431
5574
  model.layers[il].wo, model.layers[il].bo,
5432
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5575
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5433
5576
  cb(cur, "kqv_out", il);
5434
5577
  }
5435
5578
 
@@ -5634,7 +5777,7 @@ struct llm_build_context {
5634
5777
 
5635
5778
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5636
5779
  model.layers[il].wo, model.layers[il].bo,
5637
- Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5780
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5638
5781
  cb(cur, "kqv_out", il);
5639
5782
  }
5640
5783
 
@@ -5696,6 +5839,10 @@ struct llm_build_context {
5696
5839
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5697
5840
  cb(KQ_mask, "KQ_mask", -1);
5698
5841
 
5842
+ // positions of the tokens in the KV cache
5843
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5844
+ cb(KQ_pos, "KQ_pos", -1);
5845
+
5699
5846
  for (int il = 0; il < n_layer; ++il) {
5700
5847
  struct ggml_tensor * inpSA = inpL;
5701
5848
 
@@ -5723,7 +5870,7 @@ struct llm_build_context {
5723
5870
 
5724
5871
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5725
5872
  model.layers[il].wo, NULL,
5726
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5873
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5727
5874
  cb(cur, "kqv_out", il);
5728
5875
  }
5729
5876
 
@@ -5773,6 +5920,7 @@ struct llm_build_context {
5773
5920
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5774
5921
 
5775
5922
  const int64_t n_embd_head = hparams.n_embd_head_v;
5923
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5776
5924
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5777
5925
 
5778
5926
  struct ggml_tensor * cur;
@@ -5781,7 +5929,8 @@ struct llm_build_context {
5781
5929
  // get input vectors with right size
5782
5930
  const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5783
5931
  struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5784
- struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
5932
+ struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5933
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5785
5934
 
5786
5935
  // construct input embeddings (token, type, position)
5787
5936
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5789,7 +5938,9 @@ struct llm_build_context {
5789
5938
  // token types are hardcoded to zero ("Sentence A")
5790
5939
  struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5791
5940
  inpL = ggml_add(ctx0, inpL, type_row0);
5792
- inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5941
+ if (model.arch == LLM_ARCH_BERT) {
5942
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5943
+ }
5793
5944
  cb(inpL, "inp_embd", -1);
5794
5945
 
5795
5946
  // embed layer norm
@@ -5805,7 +5956,7 @@ struct llm_build_context {
5805
5956
  struct ggml_tensor * cur = inpL;
5806
5957
 
5807
5958
  // self-attention
5808
- {
5959
+ if (model.arch == LLM_ARCH_BERT) {
5809
5960
  struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5810
5961
  cb(Qcur, "Qcur", il);
5811
5962
 
@@ -5820,7 +5971,38 @@ struct llm_build_context {
5820
5971
 
5821
5972
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5822
5973
  model.layers[il].wo, model.layers[il].bo,
5823
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5974
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5975
+ cb(cur, "kqv_out", il);
5976
+ } else {
5977
+ // compute Q and K and RoPE them
5978
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5979
+ cb(cur, "wqkv", il);
5980
+
5981
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5982
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5983
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5984
+
5985
+ cb(Qcur, "Qcur", il);
5986
+ cb(Kcur, "Kcur", il);
5987
+ cb(Vcur, "Vcur", il);
5988
+
5989
+ Qcur = ggml_rope_custom(
5990
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5991
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5992
+ ext_factor, attn_factor, beta_fast, beta_slow
5993
+ );
5994
+ cb(Qcur, "Qcur", il);
5995
+
5996
+ Kcur = ggml_rope_custom(
5997
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5998
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5999
+ ext_factor, attn_factor, beta_fast, beta_slow
6000
+ );
6001
+ cb(Kcur, "Kcur", il);
6002
+
6003
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6004
+ model.layers[il].wo, model.layers[il].bo,
6005
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5824
6006
  cb(cur, "kqv_out", il);
5825
6007
  }
5826
6008
 
@@ -5828,25 +6010,34 @@ struct llm_build_context {
5828
6010
  cur = ggml_add(ctx0, cur, inpL);
5829
6011
 
5830
6012
  // attention layer norm
5831
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
6013
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
5832
6014
 
5833
6015
  struct ggml_tensor * ffn_inp = cur;
5834
6016
  cb(ffn_inp, "ffn_inp", il);
5835
6017
 
5836
6018
  // feed-forward network
5837
- cur = llm_build_ffn(ctx0, cur,
5838
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5839
- NULL, NULL,
5840
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5841
- NULL,
5842
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6019
+ if (model.arch == LLM_ARCH_BERT) {
6020
+ cur = llm_build_ffn(ctx0, cur,
6021
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6022
+ NULL, NULL,
6023
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6024
+ NULL,
6025
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6026
+ } else {
6027
+ cur = llm_build_ffn(ctx0, cur,
6028
+ model.layers[il].ffn_up, NULL,
6029
+ model.layers[il].ffn_gate, NULL,
6030
+ model.layers[il].ffn_down, NULL,
6031
+ NULL,
6032
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6033
+ }
5843
6034
  cb(cur, "ffn_out", il);
5844
6035
 
5845
6036
  // attentions bypass the intermediate layer
5846
6037
  cur = ggml_add(ctx0, cur, ffn_inp);
5847
6038
 
5848
6039
  // output layer norm
5849
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
6040
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
5850
6041
 
5851
6042
  // input for next layer
5852
6043
  inpL = cur;
@@ -5856,8 +6047,12 @@ struct llm_build_context {
5856
6047
  cur = inpL;
5857
6048
 
5858
6049
  // pooling layer
5859
- if (do_pooling) {
5860
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
6050
+ if (pooling_type == LLAMA_POOLING_MEAN) {
6051
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6052
+ } else if (pooling_type == LLAMA_POOLING_CLS) {
6053
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6054
+ } else {
6055
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
5861
6056
  }
5862
6057
  cb(cur, "result_embd", -1);
5863
6058
 
@@ -5883,6 +6078,10 @@ struct llm_build_context {
5883
6078
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5884
6079
  cb(KQ_mask, "KQ_mask", -1);
5885
6080
 
6081
+ // positions of the tokens in the KV cache
6082
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6083
+ cb(KQ_pos, "KQ_pos", -1);
6084
+
5886
6085
  inpL = llm_build_norm(ctx0, inpL, hparams,
5887
6086
  model.tok_norm,
5888
6087
  model.tok_norm_b,
@@ -5916,7 +6115,7 @@ struct llm_build_context {
5916
6115
 
5917
6116
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5918
6117
  model.layers[il].wo, model.layers[il].bo,
5919
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6118
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5920
6119
  cb(cur, "kqv_out", il);
5921
6120
  }
5922
6121
 
@@ -5976,12 +6175,16 @@ struct llm_build_context {
5976
6175
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5977
6176
  cb(KQ_mask, "KQ_mask", -1);
5978
6177
 
6178
+ // positions of the tokens in the KV cache
6179
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6180
+ cb(KQ_pos, "KQ_pos", -1);
6181
+
5979
6182
  for (int il = 0; il < n_layer; ++il) {
5980
6183
  struct ggml_tensor * attn_norm;
5981
6184
 
5982
6185
  attn_norm = llm_build_norm(ctx0, inpL, hparams,
5983
6186
  model.layers[il].attn_norm,
5984
- NULL,
6187
+ model.layers[il].attn_norm_b,
5985
6188
  LLM_NORM, cb, il);
5986
6189
  cb(attn_norm, "attn_norm", il);
5987
6190
 
@@ -5992,6 +6195,11 @@ struct llm_build_context {
5992
6195
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5993
6196
  cb(cur, "wqkv", il);
5994
6197
 
6198
+ if (model.layers[il].bqkv){
6199
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6200
+ cb(cur, "bqkv", il);
6201
+ }
6202
+
5995
6203
  if (hparams.f_clamp_kqv > 0.0f) {
5996
6204
  cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
5997
6205
  cb(cur, "wqkv_clamped", il);
@@ -6008,8 +6216,8 @@ struct llm_build_context {
6008
6216
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6009
6217
 
6010
6218
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6011
- model.layers[il].wo, NULL,
6012
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6219
+ model.layers[il].wo, model.layers[il].bo,
6220
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6013
6221
  cb(cur, "kqv_out", il);
6014
6222
  }
6015
6223
 
@@ -6021,13 +6229,13 @@ struct llm_build_context {
6021
6229
  {
6022
6230
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
6023
6231
  model.layers[il].ffn_norm,
6024
- NULL,
6232
+ model.layers[il].ffn_norm_b,
6025
6233
  LLM_NORM, cb, il);
6026
6234
  cb(cur, "ffn_norm", il);
6027
6235
  cur = llm_build_ffn(ctx0, cur,
6028
- model.layers[il].ffn_up, NULL,
6236
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6029
6237
  NULL, NULL,
6030
- model.layers[il].ffn_down, NULL,
6238
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6031
6239
  model.layers[il].ffn_act,
6032
6240
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6033
6241
  cb(cur, "ffn_out", il);
@@ -6044,7 +6252,7 @@ struct llm_build_context {
6044
6252
 
6045
6253
  cur = llm_build_norm(ctx0, cur, hparams,
6046
6254
  model.output_norm,
6047
- NULL,
6255
+ model.output_norm_b,
6048
6256
  LLM_NORM, cb, -1);
6049
6257
  cb(cur, "result_norm", -1);
6050
6258
 
@@ -6131,7 +6339,7 @@ struct llm_build_context {
6131
6339
 
6132
6340
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6133
6341
  model.layers[il].wo, NULL,
6134
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6342
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6135
6343
  cb(cur, "kqv_out", il);
6136
6344
  }
6137
6345
 
@@ -6246,7 +6454,7 @@ struct llm_build_context {
6246
6454
 
6247
6455
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6248
6456
  model.layers[il].wo, NULL,
6249
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6457
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6250
6458
  cb(cur, "kqv_out", il);
6251
6459
  }
6252
6460
 
@@ -6367,7 +6575,7 @@ struct llm_build_context {
6367
6575
 
6368
6576
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6369
6577
  model.layers[il].wo, model.layers[il].bo,
6370
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6578
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6371
6579
  cb(cur, "kqv_out", il);
6372
6580
  }
6373
6581
 
@@ -6494,7 +6702,7 @@ struct llm_build_context {
6494
6702
 
6495
6703
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6496
6704
  model.layers[il].wo, model.layers[il].bo,
6497
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
6705
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6498
6706
  cb(cur, "kqv_out", il);
6499
6707
  }
6500
6708
 
@@ -6597,7 +6805,7 @@ struct llm_build_context {
6597
6805
 
6598
6806
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6599
6807
  model.layers[il].wo, NULL,
6600
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6808
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6601
6809
  cb(cur, "kqv_out", il);
6602
6810
  }
6603
6811
  struct ggml_tensor * sa_out = cur;
@@ -6696,7 +6904,7 @@ struct llm_build_context {
6696
6904
 
6697
6905
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6698
6906
  model.layers[il].wo, model.layers[il].bo,
6699
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6907
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6700
6908
  cb(cur, "kqv_out", il);
6701
6909
  }
6702
6910
 
@@ -6805,7 +7013,7 @@ struct llm_build_context {
6805
7013
 
6806
7014
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6807
7015
  model.layers[il].wo, model.layers[il].bo,
6808
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7016
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6809
7017
  cb(cur, "kqv_out", il);
6810
7018
  }
6811
7019
 
@@ -6923,7 +7131,7 @@ struct llm_build_context {
6923
7131
 
6924
7132
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6925
7133
  model.layers[il].wo, NULL,
6926
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7134
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6927
7135
  cb(cur, "kqv_out", il);
6928
7136
  }
6929
7137
 
@@ -7042,7 +7250,7 @@ struct llm_build_context {
7042
7250
 
7043
7251
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7044
7252
  model.layers[il].wo, model.layers[il].bo,
7045
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7253
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7046
7254
  cb(cur, "kqv_out", il);
7047
7255
  }
7048
7256
 
@@ -7174,7 +7382,7 @@ struct llm_build_context {
7174
7382
 
7175
7383
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7176
7384
  model.layers[il].wo, model.layers[il].bo,
7177
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7385
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7178
7386
  cb(cur, "kqv_out", il);
7179
7387
  }
7180
7388
 
@@ -7233,6 +7441,116 @@ struct llm_build_context {
7233
7441
 
7234
7442
  return gf;
7235
7443
  }
7444
+
7445
+ struct ggml_cgraph * build_gemma() {
7446
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7447
+
7448
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
7449
+
7450
+ struct ggml_tensor * cur;
7451
+ struct ggml_tensor * inpL;
7452
+
7453
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7454
+ cb(inpL, "inp_embd", -1);
7455
+
7456
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
7457
+ cb(inpL, "inp_scaled", -1);
7458
+
7459
+ // inp_pos - contains the positions
7460
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7461
+ cb(inp_pos, "inp_pos", -1);
7462
+
7463
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7464
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7465
+ cb(KQ_mask, "KQ_mask", -1);
7466
+
7467
+ // shift the entire K-cache if needed
7468
+ if (do_rope_shift) {
7469
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7470
+ }
7471
+
7472
+ for (int il = 0; il < n_layer; ++il) {
7473
+
7474
+ // norm
7475
+ cur = llm_build_norm(ctx0, inpL, hparams,
7476
+ model.layers[il].attn_norm, NULL,
7477
+ LLM_NORM_RMS, cb, il);
7478
+ cb(cur, "attn_norm", il);
7479
+
7480
+ // self-attention
7481
+ {
7482
+ // compute Q and K and RoPE them
7483
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
7484
+ cb(Qcur, "Qcur", il);
7485
+
7486
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
7487
+ cb(Kcur, "Kcur", il);
7488
+
7489
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7490
+ cb(Vcur, "Vcur", il);
7491
+
7492
+ Qcur = ggml_rope_custom(
7493
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
7494
+ n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7495
+ ext_factor, attn_factor, beta_fast, beta_slow);
7496
+ cb(Qcur, "Qcur", il);
7497
+
7498
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
7499
+ cb(Qcur, "Qcur_scaled", il);
7500
+
7501
+ Kcur = ggml_rope_custom(
7502
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
7503
+ n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7504
+ ext_factor, attn_factor, beta_fast, beta_slow);
7505
+ cb(Kcur, "Kcur", il);
7506
+
7507
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7508
+ model.layers[il].wo, NULL,
7509
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7510
+ cb(cur, "kqv_out", il);
7511
+ }
7512
+
7513
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
7514
+ cb(sa_out, "sa_out", il);
7515
+
7516
+ cur = llm_build_norm(ctx0, sa_out, hparams,
7517
+ model.layers[il].ffn_norm, NULL,
7518
+ LLM_NORM_RMS, cb, il);
7519
+ cb(cur, "ffn_norm", il);
7520
+
7521
+ // feed-forward network
7522
+ {
7523
+ cur = llm_build_ffn(ctx0, cur,
7524
+ model.layers[il].ffn_up, NULL,
7525
+ model.layers[il].ffn_gate, NULL,
7526
+ model.layers[il].ffn_down, NULL,
7527
+ NULL,
7528
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
7529
+ cb(cur, "ffn_out", il);
7530
+ }
7531
+
7532
+ cur = ggml_add(ctx0, cur, sa_out);
7533
+ cb(cur, "l_out", il);
7534
+
7535
+ // input for next layer
7536
+ inpL = cur;
7537
+ }
7538
+
7539
+ cur = inpL;
7540
+
7541
+ cur = llm_build_norm(ctx0, cur, hparams,
7542
+ model.output_norm, NULL,
7543
+ LLM_NORM_RMS, cb, -1);
7544
+ cb(cur, "result_norm", -1);
7545
+
7546
+ // lm_head
7547
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7548
+ cb(cur, "result_output", -1);
7549
+
7550
+ ggml_build_forward_expand(gf, cur);
7551
+
7552
+ return gf;
7553
+ }
7236
7554
  };
7237
7555
 
7238
7556
  static struct ggml_cgraph * llama_build_graph(
@@ -7289,6 +7607,7 @@ static struct ggml_cgraph * llama_build_graph(
7289
7607
  result = llm.build_refact();
7290
7608
  } break;
7291
7609
  case LLM_ARCH_BERT:
7610
+ case LLM_ARCH_NOMIC_BERT:
7292
7611
  {
7293
7612
  result = llm.build_bert();
7294
7613
  } break;
@@ -7340,6 +7659,10 @@ static struct ggml_cgraph * llama_build_graph(
7340
7659
  {
7341
7660
  result = llm.build_minicpm();
7342
7661
  } break;
7662
+ case LLM_ARCH_GEMMA:
7663
+ {
7664
+ result = llm.build_gemma();
7665
+ } break;
7343
7666
  default:
7344
7667
  GGML_ASSERT(false);
7345
7668
  }
@@ -7404,12 +7727,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7404
7727
  }
7405
7728
  }
7406
7729
 
7407
- {
7408
- assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7409
- float * data = (float *) lctx.inp_sum->data;
7730
+ if (hparams.need_kq_pos) {
7731
+ const int64_t n_kv = kv_self.n;
7732
+
7733
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
7410
7734
 
7411
- for (int i = 0; i < batch.n_tokens; ++i) {
7412
- data[i] = 1.0f/float(batch.n_tokens);
7735
+ float * data = (float *) lctx.inp_KQ_pos->data;
7736
+
7737
+ for (int i = 0; i < n_kv; ++i) {
7738
+ data[i] = float(lctx.kv_self.cells[i].pos);
7413
7739
  }
7414
7740
  }
7415
7741
 
@@ -7425,17 +7751,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7425
7751
  }
7426
7752
  }
7427
7753
 
7428
- if (hparams.pooling_layer && cparams.do_pooling) {
7754
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
7429
7755
  const int64_t n_tokens = batch.n_tokens;
7430
7756
 
7431
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7432
- float * data = (float *) lctx.inp_sum->data;
7757
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7758
+ float * data = (float *) lctx.inp_mean->data;
7433
7759
 
7434
- memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
7760
+ memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7435
7761
 
7762
+ std::vector<uint64_t> sum(n_tokens, 0);
7436
7763
  for (int i = 0; i < n_tokens; ++i) {
7437
7764
  const llama_seq_id seq_id = batch.seq_id[i][0];
7438
- data[seq_id*n_tokens + i] = 1.0f;
7765
+ sum[seq_id] += 1;
7766
+ }
7767
+
7768
+ std::vector<float> div(n_tokens, 0.0f);
7769
+ for (int i = 0; i < n_tokens; ++i) {
7770
+ const uint64_t s = sum[i];
7771
+ if (s > 0) {
7772
+ div[i] = 1.0f/float(s);
7773
+ }
7774
+ }
7775
+
7776
+ for (int i = 0; i < n_tokens; ++i) {
7777
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7778
+ data[seq_id*n_tokens + i] = div[seq_id];
7779
+ }
7780
+ }
7781
+
7782
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
7783
+ const int64_t n_tokens = batch.n_tokens;
7784
+
7785
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
7786
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
7787
+
7788
+ for (int i = 0; i < n_tokens; ++i) {
7789
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7790
+ const llama_pos pos = batch.pos[i];
7791
+ if (pos == 0) {
7792
+ data[seq_id] = i;
7793
+ }
7439
7794
  }
7440
7795
  }
7441
7796
  }
@@ -10145,25 +10500,28 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };
 
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_Q4_K;
         }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q2_K;
@@ -10173,6 +10531,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
             ++qs.i_ffn_down;
         }
+        else if (name.find("attn_output.weight") != std::string::npos) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+        }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -10187,6 +10548,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -10239,6 +10603,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
@@ -10255,7 +10622,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
@@ -10306,7 +10673,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS) {
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -10321,8 +10688,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ3_XXS:
-           case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
-           case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+           case GGML_TYPE_IQ1_S:
+           case GGML_TYPE_Q2_K:
+           case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
@@ -10363,6 +10731,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
+       case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
+       case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
 
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
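With LLAMA_FTYPE_MOSTLY_IQ1_S and LLAMA_FTYPE_MOSTLY_IQ4_NL wired into this switch, the new file types can be requested through the existing quantization entry point. A minimal caller sketch, assuming the public llama_model_quantize API as vendored by this gem and placeholder file names:

// Hypothetical example: re-quantize a GGUF model to the new IQ4_NL file type.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_quantize_params params = llama_model_quantize_default_params();
    // IQ1_S would also work here, but per the check added later in this file it
    // requires an importance matrix; IQ4_NL does not.
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;

    // returns 0 on success
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_nl.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "quantization failed\n");
    }

    llama_backend_free();
    return rc == 0 ? 0 : 1;
}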
@@ -10536,6 +10906,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        }
        if ((new_type == GGML_TYPE_IQ2_XXS ||
             new_type == GGML_TYPE_IQ2_XS ||
+            new_type == GGML_TYPE_IQ1_S ||
            (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
            LLAMA_LOG_ERROR("\n\n============================================================\n");
            LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -10770,7 +11141,7 @@ static int llama_apply_lora_from_file_internal(
                {
                    LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
                            __func__, ftype);
-                   return false;
+                   return 1;
                }
        }
 
@@ -11059,7 +11430,7 @@ bool llama_mlock_supported(void) {
    return llama_supports_mlock();
 }
 
-void llama_backend_init(bool numa) {
+void llama_backend_init(void) {
    ggml_time_init();
 
    // needed to initialize f16 tables
@@ -11069,15 +11440,17 @@ void llama_backend_init(bool numa) {
        ggml_free(ctx);
    }
 
-    if (numa) {
-        ggml_numa_init();
-    }
-
 #ifdef GGML_USE_MPI
    ggml_mpi_backend_init();
 #endif
 }
 
+void llama_numa_init(enum ggml_numa_strategy numa) {
+    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+        ggml_numa_init(numa);
+    }
+}
+
 void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
    ggml_mpi_backend_free();
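This is a breaking change for C API consumers: the NUMA flag is removed from llama_backend_init, and NUMA setup moves into the new llama_numa_init, which takes a ggml_numa_strategy. A minimal sketch of the updated start-up sequence (GGML_NUMA_STRATEGY_DISTRIBUTE is assumed from the vendored ggml headers; DISABLED is shown above to be a no-op):

#include "llama.h"

int main() {
    // before 0.12.7: llama_backend_init(/*numa =*/ true);
    llama_backend_init();

    // NUMA setup is now opt-in and strategy-based.
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

    // ... load a model, create a context, run inference ...

    llama_backend_free();
    return 0;
}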
@@ -11309,7 +11682,7 @@ struct llama_context * llama_new_context_with_model(
        // graph inputs
        {
            ggml_init_params init_params = {
-               /* .mem_size */ ggml_tensor_overhead()*7,
+               /* .mem_size */ ggml_tensor_overhead()*8,
                /* .mem_buffer */ nullptr,
                /* .no_alloc */ true,
            };
@@ -11319,15 +11692,19 @@ struct llama_context * llama_new_context_with_model(
            ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
            ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+           ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-           ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+           ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+           ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
 
            ggml_set_name(ctx->inp_tokens, "inp_tokens");
            ggml_set_name(ctx->inp_embd, "inp_embd");
            ggml_set_name(ctx->inp_pos, "inp_pos");
            ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+           ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-           ggml_set_name(ctx->inp_sum, "inp_sum");
+           ggml_set_name(ctx->inp_mean, "inp_mean");
+           ggml_set_name(ctx->inp_cls, "inp_cls");
 
            ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
 
@@ -11819,18 +12196,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
        data_ctx->write(&kv_used, sizeof(kv_used));
 
        if (kv_buf_size) {
-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            std::vector<uint8_t> tmp_buf;
            for (int il = 0; il < (int) n_layer; ++il) {
-               tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+               tmp_buf.resize(k_size);
                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
                // v is not contiguous, copy row by row
-               tmp_buf.resize(elt_size*kv_head);
+               size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+               tmp_buf.resize(v_row_size);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
                }
            }
@@ -11932,17 +12310,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        if (kv_buf_size) {
            GGML_ASSERT(kv_self.total_size() == kv_buf_size);
 
-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            for (int il = 0; il < (int) n_layer; ++il) {
-               size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
                ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                inp += k_size;
 
                // v is not contiguous, copy row by row
-               size_t v_row_size = elt_size*kv_head;
+               size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                    inp += v_row_size;
                }
            }
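Both the save and restore paths above now size the K/V copies with ggml_row_size instead of a flat element-size multiply. That is what keeps state (de)serialization correct once the cache is stored in a blocked/quantized type, where a row of n elements occupies n / block_size * block_bytes rather than n * element_size. A small sketch of the difference (the Q8_0 block figures in the comment are an assumption about the vendored ggml, used only for illustration):

#include "ggml.h"
#include <cstdio>

int main() {
    // For F16 the two formulas agree: 4096 elements take 4096 * 2 = 8192 bytes either way.
    printf("f16  row: %zu bytes\n", ggml_row_size(GGML_TYPE_F16, 4096));

    // For a blocked type such as Q8_0 (assumed: blocks of 32 elements, 34 bytes per block),
    // 4096 elements pack into 128 blocks, so the row is smaller than 4096 * element_size,
    // which is why the per-element computation had to go.
    printf("q8_0 row: %zu bytes\n", ggml_row_size(GGML_TYPE_Q8_0, 4096));
    return 0;
}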
@@ -12332,6 +12709,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
    return 0;
 }
 
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
+}
+
+// Simple version of "llama_apply_chat_template" that only works with strings
+// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+static int32_t llama_chat_apply_template_internal(
+    const std::string & tmpl,
+    const std::vector<const llama_chat_message *> & chat,
+    std::string & dest, bool add_ass) {
+    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+    std::stringstream ss;
+    if (tmpl.find("<|im_start|>") != std::string::npos) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant\n";
+        }
+    } else if (tmpl.find("[INST]") != std::string::npos) {
+        // llama2 template and its variants
+        // [variant] support system message
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+        // [variant] space before + after response
+        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        // [variant] add BOS inside history
+        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        // [variant] trim spaces from the input message
+        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        // construct the prompt
+        bool is_inside_turn = true; // skip BOS at the beginning
+        ss << "[INST] ";
+        for (auto message : chat) {
+            std::string content = strip_message ? trim(message->content) : message->content;
+            std::string role(message->role);
+            if (!is_inside_turn) {
+                is_inside_turn = true;
+                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+            }
+            if (role == "system") {
+                if (support_system_message) {
+                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                } else {
+                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                    ss << content << "\n";
+                }
+            } else if (role == "user") {
+                ss << content << " [/INST]";
+            } else {
+                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+                is_inside_turn = false;
+            }
+        }
+        // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl.find("<|user|>") != std::string::npos) {
+        // zephyr template
+        for (auto message : chat) {
+            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+        for (auto message : chat) {
+            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+            ss << bos << message->role << "\n" << message->content << "</s>\n";
+        }
+        if (add_ass) {
+            ss << "<s>assistant\n";
+        }
+    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+        // google/gemma-7b-it
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+                system_prompt = trim(message->content);
+                continue;
+            }
+            // in gemma, "assistant" is "model"
+            role = role == "assistant" ? "model" : message->role;
+            ss << "<start_of_turn>" << role << "\n";
+            if (!system_prompt.empty() && role != "model") {
+                ss << system_prompt << "\n\n";
+                system_prompt = "";
+            }
+            ss << trim(message->content) << "<end_of_turn>\n";
+        }
+        if (add_ass) {
+            ss << "<start_of_turn>model\n";
+        }
+    } else {
+        // template not supported
+        return -1;
+    }
+    dest = ss.str();
+    return dest.size();
+}
+
+LLAMA_API int32_t llama_chat_apply_template(
+    const struct llama_model * model,
+    const char * tmpl,
+    const struct llama_chat_message * chat,
+    size_t n_msg,
+    bool add_ass,
+    char * buf,
+    int32_t length) {
+    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+    if (tmpl == nullptr) {
+        GGML_ASSERT(model != nullptr);
+        // load template from model
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        if (res < 0) {
+            // worst case: there is no information about template, we will use chatml by default
+            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+        } else {
+            curr_tmpl = std::string(model_template.data(), model_template.size());
+        }
+    }
+    // format the chat to string
+    std::vector<const llama_chat_message *> chat_vec;
+    chat_vec.resize(n_msg);
+    for (size_t i = 0; i < n_msg; i++) {
+        chat_vec[i] = &chat[i];
+    }
+    std::string formatted_chat;
+    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+    if (res < 0) {
+        return res;
+    }
+    strncpy(buf, formatted_chat.c_str(), length);
+    return res;
+}
+
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
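The hunk above adds llama_chat_apply_template to the public C API; it can be called with an explicit template string or with tmpl == nullptr to pick up tokenizer.chat_template from the model's metadata, falling back to chatml. A minimal sketch of the intended call pattern (model loading elided; buffer sizing and the message roles are the caller's choice):

#include "llama.h"
#include <cstdint>
#include <string>
#include <vector>

// Format a two-message conversation with the model's built-in template (or the chatml fallback).
std::string format_chat(const llama_model * model) {
    const std::vector<llama_chat_message> msgs = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };

    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(model, /*tmpl =*/ nullptr,
                                          msgs.data(), msgs.size(),
                                          /*add_ass =*/ true,
                                          buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return ""; // template not recognized by the heuristic matcher
    }
    if (n > (int32_t) buf.size()) {
        // the return value is the required length, so retry with a larger buffer
        buf.resize(n + 1);
        n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                      true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n);
}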