llama_cpp 0.12.6 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -197,6 +197,7 @@ enum llm_arch {
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
+ LLM_ARCH_NOMIC_BERT,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -207,31 +208,34 @@ enum llm_arch {
  LLM_ARCH_ORION,
  LLM_ARCH_INTERNLM2,
  LLM_ARCH_MINICPM,
+ LLM_ARCH_GEMMA,
  LLM_ARCH_UNKNOWN,
  };

  static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_GEMMA, "gemma" },
  };

  enum llm_kv {
@@ -254,7 +258,7 @@ enum llm_kv {
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
- LLM_KV_POOLING_LAYER,
+ LLM_KV_POOLING_TYPE,

  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -312,7 +316,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
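Note on the hunk above: the GGUF metadata key read by the loader changes from "%s.pooling_layer" (a boolean) to "%s.pooling_type" (an integer), and the "%s" placeholder is filled with the architecture name from LLM_ARCH_NAMES. A minimal sketch of that expansion, assuming a hypothetical helper (the real loader uses its own LLM_KV functor, not this function):

#include <cstdio>
#include <string>

// Hypothetical helper, for illustration only: expand an LLM_KV_NAMES format
// string with an architecture name taken from LLM_ARCH_NAMES.
static std::string llm_kv_name(const char * arch_name, const char * fmt) {
    char buf[256];
    snprintf(buf, sizeof(buf), fmt, arch_name);
    return buf;
}

// llm_kv_name("bert",       "%s.pooling_type") -> "bert.pooling_type"
// llm_kv_name("nomic-bert", "%s.pooling_type") -> "nomic-bert.pooling_type"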
@@ -375,6 +379,7 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_OUT,
  LLM_TENSOR_ATTN_NORM,
  LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_OUT_NORM,
  LLM_TENSOR_ATTN_ROT_EMBD,
  LLM_TENSOR_FFN_GATE_INP,
  LLM_TENSOR_FFN_NORM,
@@ -387,6 +392,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_UP_EXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_LAYER_OUT_NORM,
  };

  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -503,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -552,12 +557,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
  { LLM_TENSOR_POS_EMBD, "position_embd" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_NOMIC_BERT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
@@ -741,6 +761,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  },
  },
+ {
+ LLM_ARCH_GEMMA,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1015,7 +1051,7 @@ struct llama_mmap {
  int fd = fileno(file->fp);
  int flags = MAP_SHARED;
  // prefetch/readahead impairs performance on NUMA systems
- if (numa) { prefetch = 0; }
+ if (numa) { prefetch = 0; }
  #ifdef __linux__
  // advise the kernel to read the file sequentially (increases readahead)
  if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -1485,6 +1521,7 @@ enum e_model {
  MODEL_22M,
  MODEL_33M,
  MODEL_109M,
+ MODEL_137M,
  MODEL_335M,
  MODEL_0_5B,
  MODEL_1B,
@@ -1537,12 +1574,13 @@ struct llama_hparams {
  uint32_t n_yarn_orig_ctx;
  int32_t rope_scaling_type_train;

- float f_clamp_kqv;
- float f_max_alibi_bias;
+ float f_clamp_kqv = 0.0f;
+ float f_max_alibi_bias = 0.0f;

  bool causal_attn = true;
- bool pooling_layer = false;
+ bool need_kq_pos = false;

+ uint32_t pooling_type = LLAMA_POOLING_NONE;

  bool operator!=(const llama_hparams & other) const {
  if (this->vocab_only != other.vocab_only) return true;
@@ -1620,6 +1658,8 @@ struct llama_layer {
  struct ggml_tensor * attn_q_norm_b;
  struct ggml_tensor * attn_k_norm;
  struct ggml_tensor * attn_k_norm_b;
+ struct ggml_tensor * attn_out_norm;
+ struct ggml_tensor * attn_out_norm_b;

  // attention
  struct ggml_tensor * wq;
@@ -1638,6 +1678,8 @@ struct llama_layer {
  // normalization
  struct ggml_tensor * ffn_norm;
  struct ggml_tensor * ffn_norm_b;
+ struct ggml_tensor * layer_out_norm;
+ struct ggml_tensor * layer_out_norm_b;

  // ff
  struct ggml_tensor * ffn_gate; // w1
@@ -1899,8 +1941,10 @@ struct llama_context {
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
- struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
+ struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
+ struct ggml_tensor * inp_cls; // I32 [n_batch]

  #ifdef GGML_USE_MPI
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2499,6 +2543,8 @@ struct llama_model_loader {
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+ case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+ case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
  default:
  {
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2744,13 +2790,7 @@ struct llama_model_loader {

  std::vector<no_init<uint8_t>> read_buf;

- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
- if (!cur) {
- // some tensors may be allocated in a different context
- continue;
- }
-
+ for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
  if (progress_callback) {
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
  return false;
@@ -2848,6 +2888,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";

  default: return "unknown, may not work";
  }
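Note: the strings added above encode an approximate bits-per-weight (bpw) figure for the two new file types. As a rough, illustrative size estimate only (real files differ because some tensors are kept at higher precision and metadata adds overhead), a hypothetical 7B-parameter model works out as follows:

#include <cstdio>

int main() {
    const double n_params = 7e9;                              // assumed 7B model, for illustration
    const double bpw[]    = { 1.5625 /* IQ1_S */, 4.5 /* IQ4_NL */ };
    for (double b : bpw) {
        // bits per weight -> total bytes -> gigabytes
        printf("%.4f bpw -> ~%.2f GB\n", b, n_params * b / 8.0 / 1e9);
    }
    // prints roughly: 1.5625 bpw -> ~1.37 GB, 4.5000 bpw -> ~3.94 GB
    return 0;
}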
@@ -2855,6 +2897,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

  static const char * llama_model_type_name(e_model type) {
  switch (type) {
+ case MODEL_22M: return "22M";
+ case MODEL_33M: return "33M";
+ case MODEL_109M: return "109M";
+ case MODEL_137M: return "137M";
+ case MODEL_0_5B: return "0.5B";
  case MODEL_1B: return "1B";
  case MODEL_2B: return "2B";
  case MODEL_3B: return "3B";
@@ -3024,6 +3071,11 @@ static void llm_load_hparams(
  case 40: model.type = e_model::MODEL_13B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
+
+ if (model.type == e_model::MODEL_13B) {
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
+ }
  } break;
  case LLM_ARCH_STARCODER:
  {
@@ -3051,13 +3103,16 @@ static void llm_load_hparams(
  case 32: model.type = e_model::MODEL_1B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
+
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
  } break;
  case LLM_ARCH_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
- ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

  switch (hparams.n_layer) {
  case 3:
@@ -3073,6 +3128,17 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_335M; break; // bge-large
  }
  } break;
+ case LLM_ARCH_NOMIC_BERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+ model.type = e_model::MODEL_137M;
+ }
+ } break;
  case LLM_ARCH_BLOOM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3085,11 +3151,12 @@ static void llm_load_hparams(
  case 4096: model.type = e_model::MODEL_7B; break;
  } break;
  }
+
+ // TODO: become GGUF KV parameter
+ hparams.f_max_alibi_bias = 8.0f;
  } break;
  case LLM_ARCH_MPT:
  {
- hparams.f_clamp_kqv = 0.0f;
-
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
@@ -3187,10 +3254,24 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GEMMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 18: model.type = e_model::MODEL_2B; break;
+ case 28: model.type = e_model::MODEL_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

  model.ftype = ml.ftype;
+
+ if (hparams.f_max_alibi_bias > 0.0f) {
+ hparams.need_kq_pos = true;
+ }
  }

  // TODO: This should probably be in llama.h
@@ -3634,7 +3715,7 @@ static bool llm_load_tensors(
  }

  // create one context per buffer type
- size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+ size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  for (auto & it : buft_layer_count) {
  struct ggml_init_params params = {
@@ -3772,6 +3853,7 @@ static bool llm_load_tensors(
  } else {
  model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
  ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
  }
  }

@@ -3875,10 +3957,14 @@ static bool llm_load_tensors(
  }
  } break;
  case LLM_ARCH_BERT:
+ case LLM_ARCH_NOMIC_BERT:
  {
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+ if (model.arch == LLM_ARCH_BERT) {
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ }
+
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});

@@ -3888,29 +3974,38 @@ static bool llm_load_tensors(

  auto & layer = model.layers[i];

- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+ if (model.arch == LLM_ARCH_BERT) {
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ } else {
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ }

- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});

- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});

- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ if (model.arch == LLM_ARCH_BERT) {
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});

- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ } else {
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ }
+
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
  }
  } break;
  case LLM_ARCH_BLOOM:
@@ -3958,7 +4053,12 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+
+ // same as tok_embd, duplicated to allow offloading
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -3967,14 +4067,23 @@ static bool llm_load_tensors(

  auto & layer = model.layers[i];

- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);

- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

  // AWQ ScaleActivation layer
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4287,6 +4396,40 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_GEMMA:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+
+ const int64_t n_ff = hparams.n_ff;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -4720,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * wo_b,
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
+ struct ggml_tensor * kq_pos,
  int64_t n_ctx,
  int32_t n_tokens,
  int32_t n_kv,
- float max_alibi_bias,
  float kq_scale,
  const llm_build_cb & cb,
  int il) {
@@ -4753,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
  }

- if (max_alibi_bias > 0.0f) {
- // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+ #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
+ if (hparams.f_max_alibi_bias > 0.0f) {
  kq = ggml_scale(ctx, kq, kq_scale);
  cb(kq, "kq_scaled", il);

- if (max_alibi_bias > 0.0f) {
- // TODO: n_head or n_head_kv
- // TODO: K-shift is likely not working
- // TODO: change to ggml_add
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
- }
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);

  kq = ggml_add(ctx, kq, kq_mask);
  cb(kq, "kq_masked", il);

  kq = ggml_soft_max(ctx, kq);
  cb(kq, "kq_soft_max", il);
- } else {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
+ } else
+ #endif
+ {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
  cb(kq, "kq_soft_max_ext", il);
  }

@@ -4820,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_tensor * v_cur,
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
+ struct ggml_tensor * kq_pos,
  int64_t n_ctx,
  int32_t n_tokens,
  int32_t kv_head,
  int32_t n_kv,
- float max_alibi_bias,
  float kq_scale,
  const llm_build_cb & cb,
  int il) {
@@ -4838,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

  struct ggml_tensor * cur;
- cur = llm_build_kqv(ctx, model, hparams, kv, graph,
- wo, wo_b,
- q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
+ q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);

  return cur;
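Note on the llm_build_kqv/llm_build_kv hunks above: the per-call max_alibi_bias argument is dropped and replaced by a kq_pos tensor holding the positions of the tokens in the KV cache; where the backend supports it, the ALiBi bias is now folded into ggml_soft_max_ext together with the mask and the scale. Conceptually, ALiBi adds a head-specific slope times the key position to each attention logit before the softmax. A scalar sketch of the slope schedule, assuming the usual power-of-two formulation from the ALiBi paper (this is an illustration, not the ggml kernel):

#include <cmath>

// Head-specific ALiBi slope for head h out of n_head, given max_bias
// (e.g. 8.0f for the models in this diff). Assumed formulation; the exact
// implementation details (sign/offset conventions) differ per backend and
// cancel in the softmax, which only depends on per-row differences.
static float alibi_slope(int h, int n_head, float max_bias) {
    const int   n_head_log2 = 1 << (int) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias / n_head_log2);
    const float m1 = powf(2.0f, -max_bias / (2.0f * n_head_log2));
    return h < n_head_log2 ? powf(m0, h + 1)
                           : powf(m1, 2 * (h - n_head_log2) + 1);
}

// logit[h][i][j] = kq_scale * dot(q_i, k_j) + alibi_slope(h, n_head, max_bias) * kq_pos[j]
// followed by the masked softmax; using absolute key positions only shifts each
// row by a constant, which the softmax removes.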
@@ -4881,7 +5023,7 @@ struct llm_build_context {
  const int32_t n_orig_ctx;

  const bool do_rope_shift;
- const bool do_pooling;
+ const uint32_t pooling_type;

  const llm_build_cb & cb;

@@ -4925,7 +5067,7 @@ struct llm_build_context {
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
  n_orig_ctx (cparams.n_yarn_orig_ctx),
  do_rope_shift (worst_case || kv_self.has_shift),
- do_pooling (hparams.pooling_layer && cparams.do_pooling),
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
  cb (cb),
  buf_compute_meta (lctx.buf_compute_meta) {
  // all initializations should be done in init()
@@ -5008,7 +5150,7 @@ struct llm_build_context {
  }

  Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -5023,7 +5165,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5153,6 +5295,10 @@ struct llm_build_context {
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
  cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
  // shift the entire K-cache if needed
  if (do_rope_shift) {
  llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
@@ -5201,12 +5347,9 @@ struct llm_build_context {
  cb(Kcur, "Kcur", il);


- // apply ALiBi for 13B model
- const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
-
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5330,7 +5473,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5429,7 +5572,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5634,7 +5777,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5696,6 +5839,10 @@ struct llm_build_context {
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
  cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
  for (int il = 0; il < n_layer; ++il) {
  struct ggml_tensor * inpSA = inpL;

@@ -5723,7 +5870,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5773,6 +5920,7 @@ struct llm_build_context {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

  struct ggml_tensor * cur;
@@ -5781,7 +5929,8 @@ struct llm_build_context {
  // get input vectors with right size
  const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
  struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
- struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
+ struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);

  // construct input embeddings (token, type, position)
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5789,7 +5938,9 @@ struct llm_build_context {
  // token types are hardcoded to zero ("Sentence A")
  struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  inpL = ggml_add(ctx0, inpL, type_row0);
- inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ if (model.arch == LLM_ARCH_BERT) {
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ }
  cb(inpL, "inp_embd", -1);

  // embed layer norm
@@ -5805,7 +5956,7 @@ struct llm_build_context {
  struct ggml_tensor * cur = inpL;

  // self-attention
- {
+ if (model.arch == LLM_ARCH_BERT) {
  struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
  cb(Qcur, "Qcur", il);

@@ -5820,7 +5971,38 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ } else {
+ // compute Q and K and RoPE them
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5828,25 +6010,34 @@ struct llm_build_context {
  cur = ggml_add(ctx0, cur, inpL);

  // attention layer norm
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);

  struct ggml_tensor * ffn_inp = cur;
  cb(ffn_inp, "ffn_inp", il);

  // feed-forward network
- cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ if (model.arch == LLM_ARCH_BERT) {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ } else {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ }
  cb(cur, "ffn_out", il);

  // attentions bypass the intermediate layer
  cur = ggml_add(ctx0, cur, ffn_inp);

  // output layer norm
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);

  // input for next layer
  inpL = cur;
@@ -5856,8 +6047,12 @@ struct llm_build_context {
  cur = inpL;

  // pooling layer
- if (do_pooling) {
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+ if (pooling_type == LLAMA_POOLING_MEAN) {
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+ } else if (pooling_type == LLAMA_POOLING_CLS) {
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
+ } else {
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
  }
  cb(cur, "result_embd", -1);
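Note: the single do_pooling flag becomes a pooling_type with two concrete modes in the graph above. LLAMA_POOLING_MEAN averages the token embeddings of each sequence (via the matmul with inp_mean), while LLAMA_POOLING_CLS picks the embedding of each sequence's first token with ggml_get_rows and inp_cls. A plain C++ sketch of the reference semantics for a single sequence (illustration only; the graph computes this with ggml ops across the whole batch):

#include <cstddef>
#include <vector>

// embd: one row per token of a single sequence, each row of size n_embd
static std::vector<float> pool_mean(const std::vector<std::vector<float>> & embd) {
    std::vector<float> out(embd[0].size(), 0.0f);
    for (const auto & row : embd) {
        for (size_t d = 0; d < row.size(); ++d) {
            out[d] += row[d];
        }
    }
    for (float & v : out) {
        v /= (float) embd.size();        // average over this sequence's tokens
    }
    return out;
}

static std::vector<float> pool_cls(const std::vector<std::vector<float>> & embd) {
    return embd.front();                 // embedding of the first (CLS) token
}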
@@ -5883,6 +6078,10 @@ struct llm_build_context {
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
  cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
  inpL = llm_build_norm(ctx0, inpL, hparams,
  model.tok_norm,
  model.tok_norm_b,
@@ -5916,7 +6115,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -5976,12 +6175,16 @@ struct llm_build_context {
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
  cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
  for (int il = 0; il < n_layer; ++il) {
  struct ggml_tensor * attn_norm;

  attn_norm = llm_build_norm(ctx0, inpL, hparams,
  model.layers[il].attn_norm,
- NULL,
+ model.layers[il].attn_norm_b,
  LLM_NORM, cb, il);
  cb(attn_norm, "attn_norm", il);

@@ -5992,6 +6195,11 @@ struct llm_build_context {
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);

+ if (model.layers[il].bqkv){
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+
  if (hparams.f_clamp_kqv > 0.0f) {
  cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  cb(cur, "wqkv_clamped", il);
@@ -6008,8 +6216,8 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
- model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -6021,13 +6229,13 @@ struct llm_build_context {
  {
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
  model.layers[il].ffn_norm,
- NULL,
+ model.layers[il].ffn_norm_b,
  LLM_NORM, cb, il);
  cb(cur, "ffn_norm", il);
  cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  NULL, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  model.layers[il].ffn_act,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  cb(cur, "ffn_out", il);
@@ -6044,7 +6252,7 @@ struct llm_build_context {

  cur = llm_build_norm(ctx0, cur, hparams,
  model.output_norm,
- NULL,
+ model.output_norm_b,
  LLM_NORM, cb, -1);
  cb(cur, "result_norm", -1);

@@ -6131,7 +6339,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -6246,7 +6454,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -6367,7 +6575,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -6494,7 +6702,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -6597,7 +6805,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }
  struct ggml_tensor * sa_out = cur;
@@ -6696,7 +6904,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -6805,7 +7013,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -6923,7 +7131,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -7042,7 +7250,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -7174,7 +7382,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  cb(cur, "kqv_out", il);
  }

@@ -7233,6 +7441,116 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_gemma() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+ n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+ cb(Qcur, "Qcur_scaled", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+ n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+ cb(sa_out, "sa_out", il);
+
+ cur = llm_build_norm(ctx0, sa_out, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  static struct ggml_cgraph * llama_build_graph(
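Two details of the new build_gemma() graph above are easy to miss: the token embeddings are scaled by sqrtf(n_embd) right after lookup, and Qcur is pre-scaled by 1.0f/sqrtf(n_embd_head_k) before llm_build_kv, which is why the kq_scale argument is passed as 1.0f there. A tiny scalar check of that equivalence (illustrative only; the head size of 256 is an assumed example value, not taken from this diff):

#include <cassert>
#include <cmath>

int main() {
    const float q = 0.8f, k = -1.2f, d_k = 256.0f;    // d_k is an assumed example value
    const float a = (q / sqrtf(d_k)) * k * 1.0f;       // Gemma path: pre-scaled Q, kq_scale = 1.0f
    const float b = q * k * (1.0f / sqrtf(d_k));       // usual path: kq_scale = 1/sqrt(d_k)
    assert(fabsf(a - b) < 1e-6f);                      // same attention logit either way
    return 0;
}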
@@ -7289,6 +7607,7 @@ static struct ggml_cgraph * llama_build_graph(
  result = llm.build_refact();
  } break;
  case LLM_ARCH_BERT:
+ case LLM_ARCH_NOMIC_BERT:
  {
  result = llm.build_bert();
  } break;
@@ -7340,6 +7659,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_minicpm();
  } break;
+ case LLM_ARCH_GEMMA:
+ {
+ result = llm.build_gemma();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -7404,12 +7727,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  }
  }

- {
- assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
- float * data = (float *) lctx.inp_sum->data;
+ if (hparams.need_kq_pos) {
+ const int64_t n_kv = kv_self.n;
+
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));

- for (int i = 0; i < batch.n_tokens; ++i) {
- data[i] = 1.0f/float(batch.n_tokens);
+ float * data = (float *) lctx.inp_KQ_pos->data;
+
+ for (int i = 0; i < n_kv; ++i) {
+ data[i] = float(lctx.kv_self.cells[i].pos);
  }
  }

@@ -7425,17 +7751,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  }
  }

- if (hparams.pooling_layer && cparams.do_pooling) {
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
  const int64_t n_tokens = batch.n_tokens;

- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
- float * data = (float *) lctx.inp_sum->data;
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+ float * data = (float *) lctx.inp_mean->data;

- memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+ memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

+ std::vector<uint64_t> sum(n_tokens, 0);
  for (int i = 0; i < n_tokens; ++i) {
  const llama_seq_id seq_id = batch.seq_id[i][0];
- data[seq_id*n_tokens + i] = 1.0f;
+ sum[seq_id] += 1;
+ }
+
+ std::vector<float> div(n_tokens, 0.0f);
+ for (int i = 0; i < n_tokens; ++i) {
+ const uint64_t s = sum[i];
+ if (s > 0) {
+ div[i] = 1.0f/float(s);
+ }
+ }
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ data[seq_id*n_tokens + i] = div[seq_id];
+ }
+ }
+
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ const llama_pos pos = batch.pos[i];
+ if (pos == 0) {
+ data[seq_id] = i;
+ }
  }
  }
  }
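Note: the old code filled inp_sum with a uniform 1.0f/n_tokens weight across the whole batch; the loop above instead builds a per-sequence averaging matrix, so mean pooling averages only over the tokens that belong to each sequence. A standalone re-creation of that fill logic with an assumed toy batch (illustration only; the batch contents are made up):

#include <cstdio>
#include <vector>

int main() {
    const std::vector<int> seq_id = {0, 0, 1};           // assumed batch: two tokens of seq 0, one token of seq 1
    const int n_tokens = (int) seq_id.size();

    std::vector<float> data(n_tokens * n_tokens, 0.0f);  // the inp_mean matrix: row = seq id, col = token index
    std::vector<int>   sum(n_tokens, 0);
    for (int i = 0; i < n_tokens; ++i) {
        sum[seq_id[i]] += 1;                             // tokens per sequence
    }
    for (int i = 0; i < n_tokens; ++i) {
        data[seq_id[i]*n_tokens + i] = 1.0f / (float) sum[seq_id[i]];
    }

    // row 0 -> 0.50 0.50 0.00  (mean over the two tokens of sequence 0)
    // row 1 -> 0.00 0.00 1.00  (single token of sequence 1)
    // row 2 -> 0.00 0.00 0.00  (unused row)
    for (int r = 0; r < n_tokens; ++r) {
        for (int c = 0; c < n_tokens; ++c) {
            printf("%.2f ", data[r*n_tokens + c]);
        }
        printf("\n");
    }
    return 0;
}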
@@ -10145,25 +10500,28 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  return std::make_pair(i_layer, n_layer);
  };

- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+ // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+ // with the quantization of the output tensor
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+ (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
  int nx = tensor->ne[0];
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
  new_type = GGML_TYPE_Q8_0;
  }
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
  new_type = GGML_TYPE_Q5_K;
  }
  else if (new_type != GGML_TYPE_Q8_0) {
  new_type = GGML_TYPE_Q6_K;
  }
  } else if (name == "token_embd.weight") {
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
  new_type = GGML_TYPE_Q2_K;
  }
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
  new_type = GGML_TYPE_Q4_K;
  }
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
  if (name.find("attn_v.weight") != std::string::npos) {
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
  else new_type = GGML_TYPE_Q2_K;
@@ -10173,6 +10531,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
  ++qs.i_ffn_down;
  }
+ else if (name.find("attn_output.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+ }
  } else if (name.find("attn_v.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -10187,6 +10548,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
  }
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+ new_type = GGML_TYPE_Q5_K;
+ }
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -10239,6 +10603,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
  }
  }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
+ if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+ }
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
  new_type = GGML_TYPE_Q5_K;
@@ -10255,7 +10622,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  if (arch != LLM_ARCH_FALCON) {
  if (qs.model.hparams.n_expert == 8) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
- ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
  new_type = GGML_TYPE_Q5_K;
  }
@@ -10306,7 +10673,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
- new_type == GGML_TYPE_IQ3_XXS) {
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
  int nx = tensor->ne[0];
  int ny = tensor->ne[1];
  if (nx % QK_K != 0) {
@@ -10321,8 +10688,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
- case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
  case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
  case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
@@ -10363,6 +10731,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;

  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  }
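Note: the two new cases above map the very low-bit IQ1_S and the 4-bit non-linear IQ4_NL file types onto their ggml tensor types. A minimal sketch of requesting one of them through the existing public quantization API; the file names are placeholders:

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; // or LLAMA_FTYPE_MOSTLY_IQ1_S, which requires an importance matrix

        // returns 0 on success
        return llama_model_quantize("model-f16.gguf", "model-iq4_nl.gguf", &params);
    }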
@@ -10536,6 +10906,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }
  if ((new_type == GGML_TYPE_IQ2_XXS ||
  new_type == GGML_TYPE_IQ2_XS ||
+ new_type == GGML_TYPE_IQ1_S ||
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
  LLAMA_LOG_ERROR("\n\n============================================================\n");
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -10770,7 +11141,7 @@ static int llama_apply_lora_from_file_internal(
  {
  LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
  __func__, ftype);
- return false;
+ return 1;
  }
  }

@@ -11059,7 +11430,7 @@ bool llama_mlock_supported(void) {
  return llama_supports_mlock();
  }

- void llama_backend_init(bool numa) {
+ void llama_backend_init(void) {
  ggml_time_init();

  // needed to initialize f16 tables
@@ -11069,15 +11440,17 @@ void llama_backend_init(bool numa) {
  ggml_free(ctx);
  }

- if (numa) {
- ggml_numa_init();
- }
-
  #ifdef GGML_USE_MPI
  ggml_mpi_backend_init();
  #endif
  }

+ void llama_numa_init(enum ggml_numa_strategy numa) {
+ if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+ ggml_numa_init(numa);
+ }
+ }
+
  void llama_backend_free(void) {
  #ifdef GGML_USE_MPI
  ggml_mpi_backend_free();
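Note: NUMA setup is split out of llama_backend_init(), which now takes no arguments, into the new llama_numa_init(). A minimal sketch of the updated call sequence for API users, based on the signatures introduced above:

    #include "llama.h"

    int main() {
        // 0.12.6: llama_backend_init(/*numa =*/ false);
        // 0.12.7: backend and NUMA initialization are separate calls
        llama_backend_init();
        llama_numa_init(GGML_NUMA_STRATEGY_DISABLED); // pass a non-disabled ggml_numa_strategy to enable NUMA

        // ... load the model, create a context, run inference ...

        llama_backend_free();
        return 0;
    }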
@@ -11309,7 +11682,7 @@ struct llama_context * llama_new_context_with_model(
  // graph inputs
  {
  ggml_init_params init_params = {
- /* .mem_size */ ggml_tensor_overhead()*7,
+ /* .mem_size */ ggml_tensor_overhead()*8,
  /* .mem_buffer */ nullptr,
  /* .no_alloc */ true,
  };
@@ -11319,15 +11692,19 @@ struct llama_context * llama_new_context_with_model(
  ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+ ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
- ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+ ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+ ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

  ggml_set_name(ctx->inp_tokens, "inp_tokens");
  ggml_set_name(ctx->inp_embd, "inp_embd");
  ggml_set_name(ctx->inp_pos, "inp_pos");
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+ ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
- ggml_set_name(ctx->inp_sum, "inp_sum");
+ ggml_set_name(ctx->inp_mean, "inp_mean");
+ ggml_set_name(ctx->inp_cls, "inp_cls");

  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));

@@ -11819,18 +12196,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  data_ctx->write(&kv_used, sizeof(kv_used));

  if (kv_buf_size) {
- const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
  std::vector<uint8_t> tmp_buf;
  for (int il = 0; il < (int) n_layer; ++il) {
- tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+ size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+ tmp_buf.resize(k_size);
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
  data_ctx->write(tmp_buf.data(), tmp_buf.size());

  // v is not contiguous, copy row by row
- tmp_buf.resize(elt_size*kv_head);
+ size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+ size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+ tmp_buf.resize(v_row_size);
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
  }
  }
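Note: session serialization above (and deserialization in the next hunk) now sizes the K/V copies with ggml_row_size() instead of element size times element count, so the arithmetic stays correct when the KV cache uses a block-quantized type. The idea in isolation, with example block parameters rather than any specific ggml type:

    // bytes in one row of n elements of a block-quantized type:
    // n is assumed to be a multiple of the block size; each block stores `block_bytes` bytes
    size_t row_size(size_t n, size_t block_elems, size_t block_bytes) {
        return (n / block_elems) * block_bytes; // the per-type computation ggml_row_size(type, n) performs
    }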
@@ -11932,17 +12310,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  if (kv_buf_size) {
  GGML_ASSERT(kv_self.total_size() == kv_buf_size);

- const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
  for (int il = 0; il < (int) n_layer; ++il) {
- size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+ size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
  inp += k_size;

  // v is not contiguous, copy row by row
- size_t v_row_size = elt_size*kv_head;
+ size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+ size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
- ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
  inp += v_row_size;
  }
  }
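Note: the next hunk adds llama_chat_apply_template(), a heuristic (non-Jinja) chat formatter exposed through the public API. A minimal usage sketch based on the signature introduced below; the roles, buffer size, and helper name are illustrative:

    #include "llama.h"

    #include <algorithm>
    #include <string>
    #include <vector>

    static std::string format_chat(const llama_model * model) {
        std::vector<llama_chat_message> msgs = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };

        std::vector<char> buf(4096);
        // tmpl == nullptr -> use the model's tokenizer.chat_template, falling back to chatml
        const int32_t n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                                    /*add_ass =*/ true, buf.data(), (int32_t) buf.size());
        if (n < 0) {
            return ""; // template not recognized
        }
        // the result is copied with strncpy, so it is truncated when n >= buf.size()
        return std::string(buf.data(), std::min((size_t) n, buf.size()));
    }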
@@ -12332,6 +12709,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
  return 0;
  }

+ // trim whitespace from the beginning and end of a string
+ static std::string trim(const std::string & str) {
+ size_t start = 0;
+ size_t end = str.size();
+ while (start < end && isspace(str[start])) {
+ start += 1;
+ }
+ while (end > start && isspace(str[end - 1])) {
+ end -= 1;
+ }
+ return str.substr(start, end - start);
+ }
+
+ // Simple version of "llama_apply_chat_template" that only works with strings
+ // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+ static int32_t llama_chat_apply_template_internal(
+ const std::string & tmpl,
+ const std::vector<const llama_chat_message *> & chat,
+ std::string & dest, bool add_ass) {
+ // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+ std::stringstream ss;
+ if (tmpl.find("<|im_start|>") != std::string::npos) {
+ // chatml template
+ for (auto message : chat) {
+ ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|im_start|>assistant\n";
+ }
+ } else if (tmpl.find("[INST]") != std::string::npos) {
+ // llama2 template and its variants
+ // [variant] support system message
+ bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+ // [variant] space before + after response
+ bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+ // [variant] add BOS inside history
+ bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+ // [variant] trim spaces from the input message
+ bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+ // construct the prompt
+ bool is_inside_turn = true; // skip BOS at the beginning
+ ss << "[INST] ";
+ for (auto message : chat) {
+ std::string content = strip_message ? trim(message->content) : message->content;
+ std::string role(message->role);
+ if (!is_inside_turn) {
+ is_inside_turn = true;
+ ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+ }
+ if (role == "system") {
+ if (support_system_message) {
+ ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+ } else {
+ // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+ ss << content << "\n";
+ }
+ } else if (role == "user") {
+ ss << content << " [/INST]";
+ } else {
+ ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+ is_inside_turn = false;
+ }
+ }
+ // llama2 templates seem to not care about "add_generation_prompt"
+ } else if (tmpl.find("<|user|>") != std::string::npos) {
+ // zephyr template
+ for (auto message : chat) {
+ ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
+ } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+ // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+ for (auto message : chat) {
+ std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+ ss << bos << message->role << "\n" << message->content << "</s>\n";
+ }
+ if (add_ass) {
+ ss << "<s>assistant\n";
+ }
+ } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+ // google/gemma-7b-it
+ std::string system_prompt = "";
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+ system_prompt = trim(message->content);
+ continue;
+ }
+ // in gemma, "assistant" is "model"
+ role = role == "assistant" ? "model" : message->role;
+ ss << "<start_of_turn>" << role << "\n";
+ if (!system_prompt.empty() && role != "model") {
+ ss << system_prompt << "\n\n";
+ system_prompt = "";
+ }
+ ss << trim(message->content) << "<end_of_turn>\n";
+ }
+ if (add_ass) {
+ ss << "<start_of_turn>model\n";
+ }
+ } else {
+ // template not supported
+ return -1;
+ }
+ dest = ss.str();
+ return dest.size();
+ }
+
+ LLAMA_API int32_t llama_chat_apply_template(
+ const struct llama_model * model,
+ const char * tmpl,
+ const struct llama_chat_message * chat,
+ size_t n_msg,
+ bool add_ass,
+ char * buf,
+ int32_t length) {
+ std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+ if (tmpl == nullptr) {
+ GGML_ASSERT(model != nullptr);
+ // load template from model
+ std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+ std::string template_key = "tokenizer.chat_template";
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+ if (res < 0) {
+ // worst case: there is no information about template, we will use chatml by default
+ curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+ } else {
+ curr_tmpl = std::string(model_template.data(), model_template.size());
+ }
+ }
+ // format the chat to string
+ std::vector<const llama_chat_message *> chat_vec;
+ chat_vec.resize(n_msg);
+ for (size_t i = 0; i < n_msg; i++) {
+ chat_vec[i] = &chat[i];
+ }
+ std::string formatted_chat;
+ int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+ if (res < 0) {
+ return res;
+ }
+ strncpy(buf, formatted_chat.c_str(), length);
+ return res;
+ }
+
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
  struct llama_timings result = {
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,