llama_cpp 0.12.5 → 0.12.7

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -196,6 +196,8 @@ enum llm_arch {
196
196
  LLM_ARCH_STARCODER,
197
197
  LLM_ARCH_PERSIMMON,
198
198
  LLM_ARCH_REFACT,
199
+ LLM_ARCH_BERT,
200
+ LLM_ARCH_NOMIC_BERT,
199
201
  LLM_ARCH_BLOOM,
200
202
  LLM_ARCH_STABLELM,
201
203
  LLM_ARCH_QWEN,
@@ -206,30 +208,34 @@ enum llm_arch {
206
208
  LLM_ARCH_ORION,
207
209
  LLM_ARCH_INTERNLM2,
208
210
  LLM_ARCH_MINICPM,
211
+ LLM_ARCH_GEMMA,
209
212
  LLM_ARCH_UNKNOWN,
210
213
  };
211
214
 
212
215
  static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
213
- { LLM_ARCH_LLAMA, "llama" },
214
- { LLM_ARCH_FALCON, "falcon" },
215
- { LLM_ARCH_GPT2, "gpt2" },
216
- { LLM_ARCH_GPTJ, "gptj" },
217
- { LLM_ARCH_GPTNEOX, "gptneox" },
218
- { LLM_ARCH_MPT, "mpt" },
219
- { LLM_ARCH_BAICHUAN, "baichuan" },
220
- { LLM_ARCH_STARCODER, "starcoder" },
221
- { LLM_ARCH_PERSIMMON, "persimmon" },
222
- { LLM_ARCH_REFACT, "refact" },
223
- { LLM_ARCH_BLOOM, "bloom" },
224
- { LLM_ARCH_STABLELM, "stablelm" },
225
- { LLM_ARCH_QWEN, "qwen" },
226
- { LLM_ARCH_QWEN2, "qwen2" },
227
- { LLM_ARCH_PHI2, "phi2" },
228
- { LLM_ARCH_PLAMO, "plamo" },
229
- { LLM_ARCH_CODESHELL, "codeshell" },
230
- { LLM_ARCH_ORION, "orion" },
231
- { LLM_ARCH_INTERNLM2, "internlm2" },
232
- { LLM_ARCH_MINICPM, "minicpm" },
216
+ { LLM_ARCH_LLAMA, "llama" },
217
+ { LLM_ARCH_FALCON, "falcon" },
218
+ { LLM_ARCH_GPT2, "gpt2" },
219
+ { LLM_ARCH_GPTJ, "gptj" },
220
+ { LLM_ARCH_GPTNEOX, "gptneox" },
221
+ { LLM_ARCH_MPT, "mpt" },
222
+ { LLM_ARCH_BAICHUAN, "baichuan" },
223
+ { LLM_ARCH_STARCODER, "starcoder" },
224
+ { LLM_ARCH_PERSIMMON, "persimmon" },
225
+ { LLM_ARCH_REFACT, "refact" },
226
+ { LLM_ARCH_BERT, "bert" },
227
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
228
+ { LLM_ARCH_BLOOM, "bloom" },
229
+ { LLM_ARCH_STABLELM, "stablelm" },
230
+ { LLM_ARCH_QWEN, "qwen" },
231
+ { LLM_ARCH_QWEN2, "qwen2" },
232
+ { LLM_ARCH_PHI2, "phi2" },
233
+ { LLM_ARCH_PLAMO, "plamo" },
234
+ { LLM_ARCH_CODESHELL, "codeshell" },
235
+ { LLM_ARCH_ORION, "orion" },
236
+ { LLM_ARCH_INTERNLM2, "internlm2" },
237
+ { LLM_ARCH_MINICPM, "minicpm" },
238
+ { LLM_ARCH_GEMMA, "gemma" },
233
239
  };
234
240
 
235
241
  enum llm_kv {
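This release registers three new architectures (bert, nomic-bert, gemma) in the enum and in LLM_ARCH_NAMES. As an illustrative sketch only (not code from the package), the table can be used to map the GGUF general.architecture string back to the enum:

// Illustrative sketch: reverse lookup over the LLM_ARCH_NAMES table shown above.
static llm_arch arch_from_name(const std::string & name) {
    for (const auto & it : LLM_ARCH_NAMES) {
        if (name == it.second) {
            return it.first;
        }
    }
    return LLM_ARCH_UNKNOWN;
}
// arch_from_name("nomic-bert") == LLM_ARCH_NOMIC_BERT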
@@ -252,6 +258,7 @@ enum llm_kv {
252
258
  LLM_KV_TENSOR_DATA_LAYOUT,
253
259
  LLM_KV_EXPERT_COUNT,
254
260
  LLM_KV_EXPERT_USED_COUNT,
261
+ LLM_KV_POOLING_TYPE,
255
262
 
256
263
  LLM_KV_ATTENTION_HEAD_COUNT,
257
264
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -261,6 +268,7 @@ enum llm_kv {
261
268
  LLM_KV_ATTENTION_VALUE_LENGTH,
262
269
  LLM_KV_ATTENTION_LAYERNORM_EPS,
263
270
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
271
+ LLM_KV_ATTENTION_CAUSAL,
264
272
 
265
273
  LLM_KV_ROPE_DIMENSION_COUNT,
266
274
  LLM_KV_ROPE_FREQ_BASE,
@@ -273,6 +281,7 @@ enum llm_kv {
273
281
  LLM_KV_TOKENIZER_MODEL,
274
282
  LLM_KV_TOKENIZER_LIST,
275
283
  LLM_KV_TOKENIZER_TOKEN_TYPE,
284
+ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
276
285
  LLM_KV_TOKENIZER_SCORES,
277
286
  LLM_KV_TOKENIZER_MERGES,
278
287
  LLM_KV_TOKENIZER_BOS_ID,
@@ -307,6 +316,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
307
316
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
308
317
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
309
318
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
319
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
310
320
 
311
321
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
312
322
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
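The new pooling_type entry follows the same pattern as the other per-architecture keys: the "%s" placeholder is replaced with the architecture name. A minimal sketch of that expansion (the buffer and call below are illustrative, not the package's own helper):

// Illustrative: expanding the format string registered above for a BERT model.
char key[128];
snprintf(key, sizeof(key), LLM_KV_NAMES.at(LLM_KV_POOLING_TYPE), "bert");
// key is now "bert.pooling_type"; "%s.attention.causal" and
// "tokenizer.ggml.token_type_count" are the other keys added in this release.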
@@ -316,6 +326,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
316
326
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
317
327
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
318
328
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
329
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
319
330
 
320
331
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
321
332
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -328,6 +339,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
328
339
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
329
340
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
330
341
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
342
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
331
343
  { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
332
344
  { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
333
345
  { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -355,6 +367,7 @@ struct LLM_KV {
355
367
  enum llm_tensor {
356
368
  LLM_TENSOR_TOKEN_EMBD,
357
369
  LLM_TENSOR_TOKEN_EMBD_NORM,
370
+ LLM_TENSOR_TOKEN_TYPES,
358
371
  LLM_TENSOR_POS_EMBD,
359
372
  LLM_TENSOR_OUTPUT,
360
373
  LLM_TENSOR_OUTPUT_NORM,
@@ -366,6 +379,7 @@ enum llm_tensor {
366
379
  LLM_TENSOR_ATTN_OUT,
367
380
  LLM_TENSOR_ATTN_NORM,
368
381
  LLM_TENSOR_ATTN_NORM_2,
382
+ LLM_TENSOR_ATTN_OUT_NORM,
369
383
  LLM_TENSOR_ATTN_ROT_EMBD,
370
384
  LLM_TENSOR_FFN_GATE_INP,
371
385
  LLM_TENSOR_FFN_NORM,
@@ -378,6 +392,7 @@ enum llm_tensor {
378
392
  LLM_TENSOR_FFN_UP_EXP,
379
393
  LLM_TENSOR_ATTN_Q_NORM,
380
394
  LLM_TENSOR_ATTN_K_NORM,
395
+ LLM_TENSOR_LAYER_OUT_NORM,
381
396
  };
382
397
 
383
398
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -494,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
494
509
  {
495
510
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
496
511
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
497
- { LLM_TENSOR_OUTPUT, "output" },
498
512
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
499
513
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
500
514
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -536,6 +550,38 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
536
550
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
537
551
  },
538
552
  },
553
+ {
554
+ LLM_ARCH_BERT,
555
+ {
556
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
557
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
558
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
559
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
560
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
561
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
562
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
563
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
564
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
565
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
566
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
567
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
568
+ },
569
+ },
570
+ {
571
+ LLM_ARCH_NOMIC_BERT,
572
+ {
573
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
574
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
575
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
576
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
577
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
578
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
579
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
580
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
581
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
582
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
583
+ },
584
+ },
539
585
  {
540
586
  LLM_ARCH_BLOOM,
541
587
  {
@@ -715,6 +761,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
715
761
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
716
762
  },
717
763
  },
764
+ {
765
+ LLM_ARCH_GEMMA,
766
+ {
767
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
768
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
769
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
770
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
771
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
772
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
773
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
774
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
775
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
776
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
777
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
778
+ },
779
+ },
718
780
  {
719
781
  LLM_ARCH_UNKNOWN,
720
782
  {
@@ -748,22 +810,37 @@ struct LLM_TN {
748
810
  llm_arch arch;
749
811
 
750
812
  std::string operator()(llm_tensor tensor) const {
813
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
814
+ return "__missing__";
815
+ }
751
816
  return LLM_TENSOR_NAMES[arch].at(tensor);
752
817
  }
753
818
 
754
819
  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
820
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
821
+ return "__missing__";
822
+ }
755
823
  return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
756
824
  }
757
825
 
758
826
  std::string operator()(llm_tensor tensor, int bid) const {
827
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
828
+ return "__missing__";
829
+ }
759
830
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
760
831
  }
761
832
 
762
833
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
834
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
835
+ return "__missing__";
836
+ }
763
837
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
764
838
  }
765
839
 
766
840
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
841
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
842
+ return "__missing__";
843
+ }
767
844
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
768
845
  }
769
846
  };
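Each LLM_TN::operator() overload now guards the lookup, so a tensor that is not listed for the current architecture yields the sentinel "__missing__" instead of std::map::at throwing. A hedged usage sketch, assuming the usual LLM_TN constructor that takes an llm_arch:

// Illustrative only: Gemma's table defines no position embedding tensor,
// so that lookup falls back to the sentinel instead of throwing.
LLM_TN tn(LLM_ARCH_GEMMA);
std::string ok      = tn(LLM_TENSOR_ATTN_Q, "weight", 0); // "blk.0.attn_q.weight"
std::string missing = tn(LLM_TENSOR_POS_EMBD, "weight");  // "__missing__"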
@@ -974,7 +1051,7 @@ struct llama_mmap {
974
1051
  int fd = fileno(file->fp);
975
1052
  int flags = MAP_SHARED;
976
1053
  // prefetch/readahead impairs performance on NUMA systems
977
- if (numa) { prefetch = 0; }
1054
+ if (numa) { prefetch = 0; }
978
1055
  #ifdef __linux__
979
1056
  // advise the kernel to read the file sequentially (increases readahead)
980
1057
  if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -1440,6 +1517,12 @@ static llama_state g_state;
1440
1517
  // available llama models
1441
1518
  enum e_model {
1442
1519
  MODEL_UNKNOWN,
1520
+ MODEL_17M,
1521
+ MODEL_22M,
1522
+ MODEL_33M,
1523
+ MODEL_109M,
1524
+ MODEL_137M,
1525
+ MODEL_335M,
1443
1526
  MODEL_0_5B,
1444
1527
  MODEL_1B,
1445
1528
  MODEL_2B,
@@ -1481,6 +1564,7 @@ struct llama_hparams {
1481
1564
  uint32_t n_ff;
1482
1565
  uint32_t n_expert = 0;
1483
1566
  uint32_t n_expert_used = 0;
1567
+ uint32_t n_vocab_type = 0; // for BERT-style token types
1484
1568
 
1485
1569
  float f_norm_eps;
1486
1570
  float f_norm_rms_eps;
@@ -1490,9 +1574,13 @@ struct llama_hparams {
1490
1574
  uint32_t n_yarn_orig_ctx;
1491
1575
  int32_t rope_scaling_type_train;
1492
1576
 
1493
- float f_clamp_kqv;
1494
- float f_max_alibi_bias;
1577
+ float f_clamp_kqv = 0.0f;
1578
+ float f_max_alibi_bias = 0.0f;
1579
+
1580
+ bool causal_attn = true;
1581
+ bool need_kq_pos = false;
1495
1582
 
1583
+ uint32_t pooling_type = LLAMA_POOLING_NONE;
1496
1584
 
1497
1585
  bool operator!=(const llama_hparams & other) const {
1498
1586
  if (this->vocab_only != other.vocab_only) return true;
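The new causal_attn flag (true by default, read from %s.attention.causal for the BERT-style architectures) controls whether the attention mask hides future tokens. A rough sketch of the difference it encodes, with mask, n_tokens and the flag as illustrative placeholders rather than package code:

// Illustrative: causal masking puts -INF above the diagonal; an encoder-style
// (non-causal) mask leaves every position in the batch visible.
for (int i = 0; i < n_tokens; ++i) {
    for (int j = 0; j < n_tokens; ++j) {
        mask[i*n_tokens + j] = (causal_attn && j > i) ? -INFINITY : 0.0f;
    }
}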
@@ -1554,6 +1642,7 @@ struct llama_cparams {
1554
1642
 
1555
1643
  bool mul_mat_q;
1556
1644
  bool offload_kqv;
1645
+ bool do_pooling;
1557
1646
 
1558
1647
  ggml_backend_sched_eval_callback cb_eval;
1559
1648
  void * cb_eval_user_data;
@@ -1569,6 +1658,8 @@ struct llama_layer {
1569
1658
  struct ggml_tensor * attn_q_norm_b;
1570
1659
  struct ggml_tensor * attn_k_norm;
1571
1660
  struct ggml_tensor * attn_k_norm_b;
1661
+ struct ggml_tensor * attn_out_norm;
1662
+ struct ggml_tensor * attn_out_norm_b;
1572
1663
 
1573
1664
  // attention
1574
1665
  struct ggml_tensor * wq;
@@ -1587,6 +1678,8 @@ struct llama_layer {
1587
1678
  // normalization
1588
1679
  struct ggml_tensor * ffn_norm;
1589
1680
  struct ggml_tensor * ffn_norm_b;
1681
+ struct ggml_tensor * layer_out_norm;
1682
+ struct ggml_tensor * layer_out_norm_b;
1590
1683
 
1591
1684
  // ff
1592
1685
  struct ggml_tensor * ffn_gate; // w1
@@ -1720,6 +1813,7 @@ struct llama_model {
1720
1813
  llama_vocab vocab;
1721
1814
 
1722
1815
  struct ggml_tensor * tok_embd;
1816
+ struct ggml_tensor * type_embd;
1723
1817
  struct ggml_tensor * pos_embd;
1724
1818
  struct ggml_tensor * tok_norm;
1725
1819
  struct ggml_tensor * tok_norm_b;
@@ -1839,8 +1933,6 @@ struct llama_context {
1839
1933
  // memory buffers used to evaluate the model
1840
1934
  std::vector<uint8_t> buf_compute_meta;
1841
1935
  ggml_backend_sched_t sched = nullptr;
1842
- // allocator for the input tensors
1843
- ggml_tallocr * alloc = nullptr;
1844
1936
 
1845
1937
  // input tensors
1846
1938
  ggml_backend_buffer_t buf_input = nullptr;
@@ -1849,7 +1941,10 @@ struct llama_context {
1849
1941
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1850
1942
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1851
1943
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1944
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
1852
1945
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1946
+ struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
1947
+ struct ggml_tensor * inp_cls; // I32 [n_batch]
1853
1948
 
1854
1949
  #ifdef GGML_USE_MPI
1855
1950
  ggml_mpi_context * ctx_mpi = NULL;
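Three inputs are added to the context: inp_KQ_pos carries token positions for ALiBi, while inp_mean and inp_cls feed the two pooling modes of the embedding models. As a rough sketch (single sequence, host-side buffer purely for illustration), the mean-pooling matrix is filled with 1/n_tokens so that a matrix multiply averages the token embeddings:

// Illustrative only: a mean-pooling matrix for one sequence of n_tokens tokens.
std::vector<float> inp_mean_host(n_tokens * n_tokens, 1.0f / n_tokens);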
@@ -2448,6 +2543,8 @@ struct llama_model_loader {
2448
2543
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2449
2544
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2450
2545
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2546
+ case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
2547
+ case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2451
2548
  default:
2452
2549
  {
2453
2550
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2693,13 +2790,7 @@ struct llama_model_loader {
2693
2790
 
2694
2791
  std::vector<no_init<uint8_t>> read_buf;
2695
2792
 
2696
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2697
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2698
- if (!cur) {
2699
- // some tensors may be allocated in a different context
2700
- continue;
2701
- }
2702
-
2793
+ for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
2703
2794
  if (progress_callback) {
2704
2795
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
2705
2796
  return false;
@@ -2797,6 +2888,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2797
2888
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2798
2889
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2799
2890
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2891
+ case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
2892
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
2800
2893
 
2801
2894
  default: return "unknown, may not work";
2802
2895
  }
@@ -2804,6 +2897,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2804
2897
 
2805
2898
  static const char * llama_model_type_name(e_model type) {
2806
2899
  switch (type) {
2900
+ case MODEL_22M: return "22M";
2901
+ case MODEL_33M: return "33M";
2902
+ case MODEL_109M: return "109M";
2903
+ case MODEL_137M: return "137M";
2904
+ case MODEL_0_5B: return "0.5B";
2807
2905
  case MODEL_1B: return "1B";
2808
2906
  case MODEL_2B: return "2B";
2809
2907
  case MODEL_3B: return "3B";
@@ -2829,6 +2927,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2829
2927
  switch (type) {
2830
2928
  case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2831
2929
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2930
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2832
2931
  default: return "unknown";
2833
2932
  }
2834
2933
  }
@@ -2972,6 +3071,11 @@ static void llm_load_hparams(
2972
3071
  case 40: model.type = e_model::MODEL_13B; break;
2973
3072
  default: model.type = e_model::MODEL_UNKNOWN;
2974
3073
  }
3074
+
3075
+ if (model.type == e_model::MODEL_13B) {
3076
+ // TODO: become GGUF KV parameter
3077
+ hparams.f_max_alibi_bias = 8.0f;
3078
+ }
2975
3079
  } break;
2976
3080
  case LLM_ARCH_STARCODER:
2977
3081
  {
@@ -2999,6 +3103,41 @@ static void llm_load_hparams(
2999
3103
  case 32: model.type = e_model::MODEL_1B; break;
3000
3104
  default: model.type = e_model::MODEL_UNKNOWN;
3001
3105
  }
3106
+
3107
+ // TODO: become GGUF KV parameter
3108
+ hparams.f_max_alibi_bias = 8.0f;
3109
+ } break;
3110
+ case LLM_ARCH_BERT:
3111
+ {
3112
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3113
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3114
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3115
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3116
+
3117
+ switch (hparams.n_layer) {
3118
+ case 3:
3119
+ model.type = e_model::MODEL_17M; break; // bge-micro
3120
+ case 6:
3121
+ model.type = e_model::MODEL_22M; break; // MiniLM-L6
3122
+ case 12:
3123
+ switch (hparams.n_embd) {
3124
+ case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
3125
+ case 768: model.type = e_model::MODEL_109M; break; // bge-base
3126
+ } break;
3127
+ case 24:
3128
+ model.type = e_model::MODEL_335M; break; // bge-large
3129
+ }
3130
+ } break;
3131
+ case LLM_ARCH_NOMIC_BERT:
3132
+ {
3133
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3134
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3135
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3136
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3137
+
3138
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
3139
+ model.type = e_model::MODEL_137M;
3140
+ }
3002
3141
  } break;
3003
3142
  case LLM_ARCH_BLOOM:
3004
3143
  {
@@ -3012,11 +3151,12 @@ static void llm_load_hparams(
3012
3151
  case 4096: model.type = e_model::MODEL_7B; break;
3013
3152
  } break;
3014
3153
  }
3154
+
3155
+ // TODO: become GGUF KV parameter
3156
+ hparams.f_max_alibi_bias = 8.0f;
3015
3157
  } break;
3016
3158
  case LLM_ARCH_MPT:
3017
3159
  {
3018
- hparams.f_clamp_kqv = 0.0f;
3019
-
3020
3160
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3021
3161
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
3022
3162
  ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
@@ -3114,10 +3254,24 @@ static void llm_load_hparams(
3114
3254
  default: model.type = e_model::MODEL_UNKNOWN;
3115
3255
  }
3116
3256
  } break;
3257
+ case LLM_ARCH_GEMMA:
3258
+ {
3259
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3260
+
3261
+ switch (hparams.n_layer) {
3262
+ case 18: model.type = e_model::MODEL_2B; break;
3263
+ case 28: model.type = e_model::MODEL_7B; break;
3264
+ default: model.type = e_model::MODEL_UNKNOWN;
3265
+ }
3266
+ } break;
3117
3267
  default: (void)0;
3118
3268
  }
3119
3269
 
3120
3270
  model.ftype = ml.ftype;
3271
+
3272
+ if (hparams.f_max_alibi_bias > 0.0f) {
3273
+ hparams.need_kq_pos = true;
3274
+ }
3121
3275
  }
3122
3276
 
3123
3277
  // TODO: This should probably be in llama.h
@@ -3204,6 +3358,16 @@ static void llm_load_vocab(
3204
3358
  vocab.special_unk_id = -1;
3205
3359
  vocab.special_sep_id = -1;
3206
3360
  vocab.special_pad_id = -1;
3361
+ } else if (tokenizer_name == "bert") {
3362
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
3363
+
3364
+ // default special tokens
3365
+ vocab.special_bos_id = 101;
3366
+ vocab.special_eos_id = 102;
3367
+ vocab.special_unk_id = 100;
3368
+ vocab.special_sep_id = -1;
3369
+ vocab.special_pad_id = -1;
3370
+ vocab.add_space_prefix = false;
3207
3371
  } else {
3208
3372
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
3209
3373
  LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3231,7 +3395,14 @@ static void llm_load_vocab(
3231
3395
 
3232
3396
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
3233
3397
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
3234
- vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
3398
+ try {
3399
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
3400
+ } catch (const std::exception & e) {
3401
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
3402
+ vocab.linefeed_id = vocab.special_pad_id;
3403
+ }
3404
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3405
+ vocab.linefeed_id = vocab.special_pad_id;
3235
3406
  } else {
3236
3407
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
3237
3408
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3544,7 +3715,7 @@ static bool llm_load_tensors(
3544
3715
  }
3545
3716
 
3546
3717
  // create one context per buffer type
3547
- size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
3718
+ size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
3548
3719
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
3549
3720
  for (auto & it : buft_layer_count) {
3550
3721
  struct ggml_init_params params = {
@@ -3569,6 +3740,7 @@ static bool llm_load_tensors(
3569
3740
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3570
3741
  const int64_t n_embd_gqa = n_embd_v_gqa;
3571
3742
  const int64_t n_vocab = hparams.n_vocab;
3743
+ const int64_t n_vocab_type = hparams.n_vocab_type;
3572
3744
  const int64_t n_ff = hparams.n_ff;
3573
3745
 
3574
3746
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@@ -3681,6 +3853,7 @@ static bool llm_load_tensors(
3681
3853
  } else {
3682
3854
  model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
3683
3855
  ml.n_created--; // artificial tensor
3856
+ ml.size_data += ggml_nbytes(model.output);
3684
3857
  }
3685
3858
  }
3686
3859
 
@@ -3783,11 +3956,63 @@ static bool llm_load_tensors(
3783
3956
  layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
3784
3957
  }
3785
3958
  } break;
3959
+ case LLM_ARCH_BERT:
3960
+ case LLM_ARCH_NOMIC_BERT:
3961
+ {
3962
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3963
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3964
+ if (model.arch == LLM_ARCH_BERT) {
3965
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3966
+ }
3967
+
3968
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3969
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3970
+
3971
+ for (int i = 0; i < n_layer; ++i) {
3972
+ ggml_context * ctx_layer = ctx_for_layer(i);
3973
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3974
+
3975
+ auto & layer = model.layers[i];
3976
+
3977
+ if (model.arch == LLM_ARCH_BERT) {
3978
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3979
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3980
+
3981
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3982
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3983
+
3984
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3985
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3986
+ } else {
3987
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3988
+ }
3989
+
3990
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3991
+
3992
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
3993
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
3994
+
3995
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3996
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3997
+
3998
+ if (model.arch == LLM_ARCH_BERT) {
3999
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4000
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
4001
+
4002
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4003
+ } else {
4004
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4005
+ }
4006
+
4007
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4008
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
4009
+ }
4010
+ } break;
3786
4011
  case LLM_ARCH_BLOOM:
3787
4012
  {
3788
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3789
- model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3790
- model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
4013
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4014
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
4015
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3791
4016
 
3792
4017
  // output
3793
4018
  {
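The shapes above determine the weight-matrix parameter count of a BERT layer. A quick back-of-the-envelope check for the bge-base configuration (n_embd = 768, n_ff = 3072, 12 layers), ignoring biases and norm tensors:

// wq + wk + wv + wo : 4 * 768 * 768  = 2,359,296
// ffn_up            :     768 * 3072 = 2,359,296
// ffn_down          :    3072 * 768  = 2,359,296
// per layer ~= 7.1M, 12 layers ~= 85M; adding the 30522 x 768 token embedding
// (~23.4M) lands near the 109M label used by llm_load_hparams above.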
@@ -3828,7 +4053,12 @@ static bool llm_load_tensors(
3828
4053
  // output
3829
4054
  {
3830
4055
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3831
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4056
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
4057
+
4058
+ // same as tok_embd, duplicated to allow offloading
4059
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4060
+ ml.n_created--; // artificial tensor
4061
+ ml.size_data += ggml_nbytes(model.output);
3832
4062
  }
3833
4063
 
3834
4064
  for (int i = 0; i < n_layer; ++i) {
@@ -3837,14 +4067,23 @@ static bool llm_load_tensors(
3837
4067
 
3838
4068
  auto & layer = model.layers[i];
3839
4069
 
3840
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4070
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4071
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
3841
4072
 
3842
4073
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4074
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
4075
+
3843
4076
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4077
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
3844
4078
 
3845
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3846
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3847
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4079
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4080
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
4081
+
4082
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
4083
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
4084
+
4085
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4086
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
3848
4087
 
3849
4088
  // AWQ ScaleActivation layer
3850
4089
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4157,6 +4396,40 @@ static bool llm_load_tensors(
4157
4396
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4158
4397
  }
4159
4398
  } break;
4399
+ case LLM_ARCH_GEMMA:
4400
+ {
4401
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4402
+
4403
+ // output
4404
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4405
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
4406
+ ml.n_created--; // artificial tensor
4407
+ ml.size_data += ggml_nbytes(model.output);
4408
+
4409
+ const int64_t n_ff = hparams.n_ff;
4410
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
4411
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4412
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4413
+
4414
+ for (uint32_t i = 0; i < n_layer; ++i) {
4415
+ ggml_context * ctx_layer = ctx_for_layer(i);
4416
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4417
+
4418
+ auto & layer = model.layers[i];
4419
+
4420
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4421
+
4422
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
4423
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
4424
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
4425
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
4426
+
4427
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4428
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4429
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4430
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4431
+ }
4432
+ } break;
4160
4433
  default:
4161
4434
  throw std::runtime_error("unknown architecture");
4162
4435
  }
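Gemma ties its output projection to the token embedding: model.output is created from the token_embd tensor name, ml.n_created is decremented because no separate tensor exists in the file, and ml.size_data is bumped so the load-progress accounting stays correct. In effect (sketch of the idea, not package code) one matrix serves both ends of the network:

// One {n_embd, n_vocab} matrix used twice:
//   h0     = tok_embd[:, token]     (input embedding lookup)
//   logits = tok_embd^T * h_final   (output projection via model.output)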
@@ -4259,9 +4532,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4259
4532
 
4260
4533
  model.hparams.vocab_only = params.vocab_only;
4261
4534
 
4262
- llm_load_arch (ml, model);
4263
- llm_load_hparams(ml, model);
4264
- llm_load_vocab (ml, model);
4535
+ try {
4536
+ llm_load_arch(ml, model);
4537
+ } catch(const std::exception & e) {
4538
+ throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
4539
+ }
4540
+ try {
4541
+ llm_load_hparams(ml, model);
4542
+ } catch(const std::exception & e) {
4543
+ throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
4544
+ }
4545
+ try {
4546
+ llm_load_vocab(ml, model);
4547
+ } catch(const std::exception & e) {
4548
+ throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
4549
+ }
4265
4550
 
4266
4551
  llm_load_print_meta(ml, model);
4267
4552
 
@@ -4578,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
4578
4863
  struct ggml_tensor * wo_b,
4579
4864
  struct ggml_tensor * q_cur,
4580
4865
  struct ggml_tensor * kq_mask,
4866
+ struct ggml_tensor * kq_pos,
4581
4867
  int64_t n_ctx,
4582
4868
  int32_t n_tokens,
4583
4869
  int32_t n_kv,
4584
- float max_alibi_bias,
4585
4870
  float kq_scale,
4586
4871
  const llm_build_cb & cb,
4587
4872
  int il) {
@@ -4611,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
4611
4896
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4612
4897
  }
4613
4898
 
4614
- if (max_alibi_bias > 0.0f) {
4615
- // temporary branch until we figure out how to handle ggml_alibi through ggml_add
4899
+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
4900
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
4901
+ #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4902
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4903
+ if (hparams.f_max_alibi_bias > 0.0f) {
4616
4904
  kq = ggml_scale(ctx, kq, kq_scale);
4617
4905
  cb(kq, "kq_scaled", il);
4618
4906
 
4619
- if (max_alibi_bias > 0.0f) {
4620
- // TODO: n_head or n_head_kv
4621
- // TODO: K-shift is likely not working
4622
- // TODO: change to ggml_add
4623
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
4624
- cb(kq, "kq_scaled_alibi", il);
4625
- }
4907
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
4908
+ cb(kq, "kq_scaled_alibi", il);
4626
4909
 
4627
4910
  kq = ggml_add(ctx, kq, kq_mask);
4628
4911
  cb(kq, "kq_masked", il);
4629
4912
 
4630
4913
  kq = ggml_soft_max(ctx, kq);
4631
4914
  cb(kq, "kq_soft_max", il);
4632
- } else {
4633
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
4915
+ } else
4916
+ #endif
4917
+ {
4918
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
4634
4919
  cb(kq, "kq_soft_max_ext", il);
4635
4920
  }
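On backends that support it, the ALiBi bias is now folded into ggml_soft_max_ext through the new kq_pos argument instead of a separate ggml_alibi call; Vulkan, Kompute and SYCL keep the old path behind the #pragma message warnings. Roughly, per head h with slope m_h derived from f_max_alibi_bias, the fused call computes (sketch of the math, not literal code):

// kq'[h][i][j] = kq[h][i][j] * kq_scale + m_h * KQ_pos[j] + KQ_mask[i][j]
// followed by a row-wise softmax; with f_max_alibi_bias == 0 the m_h term
// vanishes and this reduces to the previous masked, scaled softmax.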
4636
4921
 
@@ -4678,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
4678
4963
  struct ggml_tensor * v_cur,
4679
4964
  struct ggml_tensor * q_cur,
4680
4965
  struct ggml_tensor * kq_mask,
4966
+ struct ggml_tensor * kq_pos,
4681
4967
  int64_t n_ctx,
4682
4968
  int32_t n_tokens,
4683
4969
  int32_t kv_head,
4684
4970
  int32_t n_kv,
4685
- float max_alibi_bias,
4686
4971
  float kq_scale,
4687
4972
  const llm_build_cb & cb,
4688
4973
  int il) {
@@ -4696,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
4696
4981
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4697
4982
 
4698
4983
  struct ggml_tensor * cur;
4699
- cur = llm_build_kqv(ctx, model, hparams, kv, graph,
4700
- wo, wo_b,
4701
- q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
4984
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4985
+ q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
4702
4986
  cb(cur, "kqv_out", il);
4703
4987
 
4704
4988
  return cur;
@@ -4739,6 +5023,7 @@ struct llm_build_context {
4739
5023
  const int32_t n_orig_ctx;
4740
5024
 
4741
5025
  const bool do_rope_shift;
5026
+ const uint32_t pooling_type;
4742
5027
 
4743
5028
  const llm_build_cb & cb;
4744
5029
 
@@ -4782,6 +5067,7 @@ struct llm_build_context {
4782
5067
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4783
5068
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4784
5069
  do_rope_shift (worst_case || kv_self.has_shift),
5070
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
4785
5071
  cb (cb),
4786
5072
  buf_compute_meta (lctx.buf_compute_meta) {
4787
5073
  // all initializations should be done in init()
@@ -4864,7 +5150,7 @@ struct llm_build_context {
4864
5150
  }
4865
5151
 
4866
5152
  Qcur = ggml_rope_custom(
4867
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5153
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4868
5154
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4869
5155
  ext_factor, attn_factor, beta_fast, beta_slow
4870
5156
  );
@@ -4879,7 +5165,7 @@ struct llm_build_context {
4879
5165
 
4880
5166
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4881
5167
  model.layers[il].wo, model.layers[il].bo,
4882
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5168
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4883
5169
  cb(cur, "kqv_out", il);
4884
5170
  }
4885
5171
 
@@ -5009,6 +5295,10 @@ struct llm_build_context {
5009
5295
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5010
5296
  cb(KQ_mask, "KQ_mask", -1);
5011
5297
 
5298
+ // positions of the tokens in the KV cache
5299
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5300
+ cb(KQ_pos, "KQ_pos", -1);
5301
+
5012
5302
  // shift the entire K-cache if needed
5013
5303
  if (do_rope_shift) {
5014
5304
  llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
@@ -5057,12 +5347,9 @@ struct llm_build_context {
5057
5347
  cb(Kcur, "Kcur", il);
5058
5348
 
5059
5349
 
5060
- // apply ALiBi for 13B model
5061
- const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
5062
-
5063
5350
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5064
5351
  model.layers[il].wo, NULL,
5065
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5352
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5066
5353
  cb(cur, "kqv_out", il);
5067
5354
  }
5068
5355
 
@@ -5186,7 +5473,7 @@ struct llm_build_context {
5186
5473
 
5187
5474
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5188
5475
  model.layers[il].wo, NULL,
5189
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5476
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5190
5477
  cb(cur, "kqv_out", il);
5191
5478
  }
5192
5479
 
@@ -5285,7 +5572,7 @@ struct llm_build_context {
5285
5572
 
5286
5573
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5287
5574
  model.layers[il].wo, model.layers[il].bo,
5288
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5575
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5289
5576
  cb(cur, "kqv_out", il);
5290
5577
  }
5291
5578
 
@@ -5490,7 +5777,7 @@ struct llm_build_context {
5490
5777
 
5491
5778
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5492
5779
  model.layers[il].wo, model.layers[il].bo,
5493
- Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5780
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5494
5781
  cb(cur, "kqv_out", il);
5495
5782
  }
5496
5783
 
@@ -5552,6 +5839,10 @@ struct llm_build_context {
5552
5839
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5553
5840
  cb(KQ_mask, "KQ_mask", -1);
5554
5841
 
5842
+ // positions of the tokens in the KV cache
5843
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5844
+ cb(KQ_pos, "KQ_pos", -1);
5845
+
5555
5846
  for (int il = 0; il < n_layer; ++il) {
5556
5847
  struct ggml_tensor * inpSA = inpL;
5557
5848
 
@@ -5579,7 +5870,7 @@ struct llm_build_context {
5579
5870
 
5580
5871
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5581
5872
  model.layers[il].wo, NULL,
5582
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5873
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5583
5874
  cb(cur, "kqv_out", il);
5584
5875
  }
5585
5876
 
@@ -5625,7 +5916,7 @@ struct llm_build_context {
5625
5916
  return gf;
5626
5917
  }
5627
5918
 
5628
- struct ggml_cgraph * build_bloom() {
5919
+ struct ggml_cgraph * build_bert() {
5629
5920
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5630
5921
 
5631
5922
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -5635,34 +5926,58 @@ struct llm_build_context {
5635
5926
  struct ggml_tensor * cur;
5636
5927
  struct ggml_tensor * inpL;
5637
5928
 
5929
+ // get input vectors with right size
5930
+ const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5931
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5932
+ struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5933
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5934
+
5935
+ // construct input embeddings (token, type, position)
5638
5936
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5937
+
5938
+ // token types are hardcoded to zero ("Sentence A")
5939
+ struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5940
+ inpL = ggml_add(ctx0, inpL, type_row0);
5941
+ if (model.arch == LLM_ARCH_BERT) {
5942
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5943
+ }
5639
5944
  cb(inpL, "inp_embd", -1);
5640
5945
 
5946
+ // embed layer norm
5947
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
5948
+ cb(inpL, "inp_norm", -1);
5949
+
5641
5950
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5642
5951
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5643
- cb(KQ_mask, "KQ_mask", -1);
5644
-
5645
- inpL = llm_build_norm(ctx0, inpL, hparams,
5646
- model.tok_norm,
5647
- model.tok_norm_b,
5648
- LLM_NORM, cb, -1);
5649
- cb(inpL, "inp_norm", -1);
5952
+ cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
5650
5953
 
5954
+ // iterate layers
5651
5955
  for (int il = 0; il < n_layer; ++il) {
5652
- cur = llm_build_norm(ctx0, inpL, hparams,
5653
- model.layers[il].attn_norm,
5654
- model.layers[il].attn_norm_b,
5655
- LLM_NORM, cb, il);
5656
- cb(cur, "attn_norm", il);
5956
+ struct ggml_tensor * cur = inpL;
5657
5957
 
5658
5958
  // self-attention
5659
- {
5959
+ if (model.arch == LLM_ARCH_BERT) {
5960
+ struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5961
+ cb(Qcur, "Qcur", il);
5962
+
5963
+ struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
5964
+ cb(Kcur, "Kcur", il);
5965
+
5966
+ struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
5967
+ cb(Vcur, "Vcur", il);
5968
+
5969
+ // seems like we just need to do this for Q?
5970
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5971
+
5972
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5973
+ model.layers[il].wo, model.layers[il].bo,
5974
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5975
+ cb(cur, "kqv_out", il);
5976
+ } else {
5977
+ // compute Q and K and RoPE them
5660
5978
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5661
5979
  cb(cur, "wqkv", il);
5662
5980
 
5663
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5664
- cb(cur, "bqkv", il);
5665
-
5666
5981
  struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5667
5982
  struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5668
5983
  struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
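The embedding construction at the top of build_bert combines three tables before the embedding LayerNorm; nomic-bert skips the learned position table and applies RoPE to Q/K inside each block instead. A sketch of what the graph computes per token:

// BERT:       x0 = LayerNorm( tok_embd[id] + type_embd[0] + pos_embd[pos] )
// nomic-bert: x0 = LayerNorm( tok_embd[id] + type_embd[0] ), RoPE applied later
// (the token type is hardcoded to row 0, "Sentence A", as noted above)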
@@ -5671,54 +5986,82 @@ struct llm_build_context {
5671
5986
  cb(Kcur, "Kcur", il);
5672
5987
  cb(Vcur, "Vcur", il);
5673
5988
 
5674
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5989
+ Qcur = ggml_rope_custom(
5990
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5991
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5992
+ ext_factor, attn_factor, beta_fast, beta_slow
5993
+ );
5994
+ cb(Qcur, "Qcur", il);
5995
+
5996
+ Kcur = ggml_rope_custom(
5997
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5998
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5999
+ ext_factor, attn_factor, beta_fast, beta_slow
6000
+ );
6001
+ cb(Kcur, "Kcur", il);
5675
6002
 
5676
6003
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5677
6004
  model.layers[il].wo, model.layers[il].bo,
5678
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6005
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5679
6006
  cb(cur, "kqv_out", il);
5680
6007
  }
5681
6008
 
5682
- // Add the input
5683
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
5684
- cb(ffn_inp, "ffn_inp", il);
6009
+ // re-add the layer input
6010
+ cur = ggml_add(ctx0, cur, inpL);
5685
6011
 
5686
- // FF
5687
- {
5688
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
5689
- model.layers[il].ffn_norm,
5690
- model.layers[il].ffn_norm_b,
5691
- LLM_NORM, cb, il);
5692
- cb(cur, "ffn_norm", il);
6012
+ // attention layer norm
6013
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
6014
+
6015
+ struct ggml_tensor * ffn_inp = cur;
6016
+ cb(ffn_inp, "ffn_inp", il);
5693
6017
 
6018
+ // feed-forward network
6019
+ if (model.arch == LLM_ARCH_BERT) {
5694
6020
  cur = llm_build_ffn(ctx0, cur,
5695
6021
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5696
6022
  NULL, NULL,
5697
6023
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5698
6024
  NULL,
5699
6025
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5700
- cb(cur, "ffn_out", il);
6026
+ } else {
6027
+ cur = llm_build_ffn(ctx0, cur,
6028
+ model.layers[il].ffn_up, NULL,
6029
+ model.layers[il].ffn_gate, NULL,
6030
+ model.layers[il].ffn_down, NULL,
6031
+ NULL,
6032
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5701
6033
  }
6034
+ cb(cur, "ffn_out", il);
5702
6035
 
5703
- inpL = ggml_add(ctx0, cur, ffn_inp);
5704
- cb(inpL, "l_out", il);
6036
+ // attentions bypass the intermediate layer
6037
+ cur = ggml_add(ctx0, cur, ffn_inp);
6038
+
6039
+ // output layer norm
6040
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
6041
+
6042
+ // input for next layer
6043
+ inpL = cur;
5705
6044
  }
5706
6045
 
5707
- cur = llm_build_norm(ctx0, inpL, hparams,
5708
- model.output_norm,
5709
- model.output_norm_b,
5710
- LLM_NORM, cb, -1);
5711
- cb(cur, "result_norm", -1);
6046
+ // final output
6047
+ cur = inpL;
5712
6048
 
5713
- cur = ggml_mul_mat(ctx0, model.output, cur);
5714
- cb(cur, "result_output", -1);
6049
+ // pooling layer
6050
+ if (pooling_type == LLAMA_POOLING_MEAN) {
6051
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6052
+ } else if (pooling_type == LLAMA_POOLING_CLS) {
6053
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6054
+ } else {
6055
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
6056
+ }
6057
+ cb(cur, "result_embd", -1);
5715
6058
 
5716
6059
  ggml_build_forward_expand(gf, cur);
5717
6060
 
5718
6061
  return gf;
5719
6062
  }
5720
6063
 
5721
- struct ggml_cgraph * build_mpt() {
6064
+ struct ggml_cgraph * build_bloom() {
5722
6065
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5723
6066
 
5724
6067
  const int64_t n_embd_head = hparams.n_embd_head_v;
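The pooling branch at the end of build_bert above either averages the token embeddings (LLAMA_POOLING_MEAN, via the matrix multiply with inp_mean), picks the first-token rows (LLAMA_POOLING_CLS, via ggml_get_rows with inp_cls), or leaves the per-token embeddings unchanged. An illustrative CPU equivalent of the mean branch for a single sequence, where hidden, n_tokens and n_embd are placeholders:

// pooled[e] = (1/n_tokens) * sum over t of hidden[t][e]
std::vector<float> pooled(n_embd, 0.0f);
for (int t = 0; t < n_tokens; ++t) {
    for (int e = 0; e < n_embd; ++e) {
        pooled[e] += hidden[t*n_embd + e] / n_tokens;
    }
}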
@@ -5735,14 +6078,115 @@ struct llm_build_context {
5735
6078
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5736
6079
  cb(KQ_mask, "KQ_mask", -1);
5737
6080
 
5738
- for (int il = 0; il < n_layer; ++il) {
5739
- struct ggml_tensor * attn_norm;
6081
+ // positions of the tokens in the KV cache
6082
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6083
+ cb(KQ_pos, "KQ_pos", -1);
5740
6084
 
5741
- attn_norm = llm_build_norm(ctx0, inpL, hparams,
6085
+ inpL = llm_build_norm(ctx0, inpL, hparams,
6086
+ model.tok_norm,
6087
+ model.tok_norm_b,
6088
+ LLM_NORM, cb, -1);
6089
+ cb(inpL, "inp_norm", -1);
6090
+
6091
+ for (int il = 0; il < n_layer; ++il) {
6092
+ cur = llm_build_norm(ctx0, inpL, hparams,
5742
6093
  model.layers[il].attn_norm,
5743
- NULL,
6094
+ model.layers[il].attn_norm_b,
5744
6095
  LLM_NORM, cb, il);
5745
- cb(attn_norm, "attn_norm", il);
6096
+ cb(cur, "attn_norm", il);
6097
+
6098
+ // self-attention
6099
+ {
6100
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6101
+ cb(cur, "wqkv", il);
6102
+
6103
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6104
+ cb(cur, "bqkv", il);
6105
+
6106
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6107
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6108
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6109
+
6110
+ cb(Qcur, "Qcur", il);
6111
+ cb(Kcur, "Kcur", il);
6112
+ cb(Vcur, "Vcur", il);
6113
+
6114
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6115
+
6116
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6117
+ model.layers[il].wo, model.layers[il].bo,
6118
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6119
+ cb(cur, "kqv_out", il);
6120
+ }
6121
+
6122
+ // Add the input
6123
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6124
+ cb(ffn_inp, "ffn_inp", il);
6125
+
6126
+ // FF
6127
+ {
6128
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6129
+ model.layers[il].ffn_norm,
6130
+ model.layers[il].ffn_norm_b,
6131
+ LLM_NORM, cb, il);
6132
+ cb(cur, "ffn_norm", il);
6133
+
6134
+ cur = llm_build_ffn(ctx0, cur,
6135
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6136
+ NULL, NULL,
6137
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6138
+ NULL,
6139
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6140
+ cb(cur, "ffn_out", il);
6141
+ }
6142
+
6143
+ inpL = ggml_add(ctx0, cur, ffn_inp);
6144
+ cb(inpL, "l_out", il);
6145
+ }
6146
+
6147
+ cur = llm_build_norm(ctx0, inpL, hparams,
6148
+ model.output_norm,
6149
+ model.output_norm_b,
6150
+ LLM_NORM, cb, -1);
6151
+ cb(cur, "result_norm", -1);
6152
+
6153
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6154
+ cb(cur, "result_output", -1);
6155
+
6156
+ ggml_build_forward_expand(gf, cur);
6157
+
6158
+ return gf;
6159
+ }
6160
+
6161
+ struct ggml_cgraph * build_mpt() {
6162
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6163
+
6164
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6165
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6166
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6167
+
6168
+ struct ggml_tensor * cur;
6169
+ struct ggml_tensor * inpL;
6170
+
6171
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6172
+ cb(inpL, "inp_embd", -1);
6173
+
6174
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6175
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6176
+ cb(KQ_mask, "KQ_mask", -1);
6177
+
6178
+ // positions of the tokens in the KV cache
6179
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6180
+ cb(KQ_pos, "KQ_pos", -1);
6181
+
6182
+ for (int il = 0; il < n_layer; ++il) {
6183
+ struct ggml_tensor * attn_norm;
6184
+
6185
+ attn_norm = llm_build_norm(ctx0, inpL, hparams,
6186
+ model.layers[il].attn_norm,
6187
+ model.layers[il].attn_norm_b,
6188
+ LLM_NORM, cb, il);
6189
+ cb(attn_norm, "attn_norm", il);
5746
6190
 
5747
6191
  // self-attention
5748
6192
  {
@@ -5751,6 +6195,11 @@ struct llm_build_context {
5751
6195
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5752
6196
  cb(cur, "wqkv", il);
5753
6197
 
6198
+ if (model.layers[il].bqkv){
6199
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6200
+ cb(cur, "bqkv", il);
6201
+ }
6202
+
5754
6203
  if (hparams.f_clamp_kqv > 0.0f) {
5755
6204
  cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
5756
6205
  cb(cur, "wqkv_clamped", il);
@@ -5767,8 +6216,8 @@ struct llm_build_context {
5767
6216
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5768
6217
 
5769
6218
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5770
- model.layers[il].wo, NULL,
5771
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6219
+ model.layers[il].wo, model.layers[il].bo,
6220
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5772
6221
  cb(cur, "kqv_out", il);
5773
6222
  }
5774
6223
 
@@ -5780,13 +6229,13 @@ struct llm_build_context {
5780
6229
  {
5781
6230
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
5782
6231
  model.layers[il].ffn_norm,
5783
- NULL,
6232
+ model.layers[il].ffn_norm_b,
5784
6233
  LLM_NORM, cb, il);
5785
6234
  cb(cur, "ffn_norm", il);
5786
6235
  cur = llm_build_ffn(ctx0, cur,
5787
- model.layers[il].ffn_up, NULL,
6236
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5788
6237
  NULL, NULL,
5789
- model.layers[il].ffn_down, NULL,
6238
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5790
6239
  model.layers[il].ffn_act,
5791
6240
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5792
6241
  cb(cur, "ffn_out", il);
@@ -5803,7 +6252,7 @@ struct llm_build_context {
5803
6252
 
5804
6253
  cur = llm_build_norm(ctx0, cur, hparams,
5805
6254
  model.output_norm,
5806
- NULL,
6255
+ model.output_norm_b,
5807
6256
  LLM_NORM, cb, -1);
5808
6257
  cb(cur, "result_norm", -1);
5809
6258
 
@@ -5890,7 +6339,7 @@ struct llm_build_context {
5890
6339
 
5891
6340
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5892
6341
  model.layers[il].wo, NULL,
5893
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6342
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5894
6343
  cb(cur, "kqv_out", il);
5895
6344
  }
5896
6345
 
@@ -6005,7 +6454,7 @@ struct llm_build_context {
6005
6454
 
6006
6455
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6007
6456
  model.layers[il].wo, NULL,
6008
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6457
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6009
6458
  cb(cur, "kqv_out", il);
6010
6459
  }
6011
6460
 
@@ -6126,7 +6575,7 @@ struct llm_build_context {
6126
6575
 
6127
6576
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6128
6577
  model.layers[il].wo, model.layers[il].bo,
6129
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6578
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6130
6579
  cb(cur, "kqv_out", il);
6131
6580
  }
6132
6581
 
@@ -6253,7 +6702,7 @@ struct llm_build_context {
6253
6702
 
6254
6703
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6255
6704
  model.layers[il].wo, model.layers[il].bo,
6256
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
6705
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6257
6706
  cb(cur, "kqv_out", il);
6258
6707
  }
6259
6708
 
@@ -6356,7 +6805,7 @@ struct llm_build_context {
6356
6805
 
6357
6806
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6358
6807
  model.layers[il].wo, NULL,
6359
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6808
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6360
6809
  cb(cur, "kqv_out", il);
6361
6810
  }
6362
6811
  struct ggml_tensor * sa_out = cur;
@@ -6455,7 +6904,7 @@ struct llm_build_context {
6455
6904
 
6456
6905
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6457
6906
  model.layers[il].wo, model.layers[il].bo,
6458
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6907
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6459
6908
  cb(cur, "kqv_out", il);
6460
6909
  }
6461
6910
 
@@ -6564,7 +7013,7 @@ struct llm_build_context {
6564
7013
 
6565
7014
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6566
7015
  model.layers[il].wo, model.layers[il].bo,
6567
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7016
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6568
7017
  cb(cur, "kqv_out", il);
6569
7018
  }
6570
7019
 
@@ -6682,7 +7131,7 @@ struct llm_build_context {
6682
7131
 
6683
7132
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6684
7133
  model.layers[il].wo, NULL,
6685
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7134
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6686
7135
  cb(cur, "kqv_out", il);
6687
7136
  }
6688
7137
 
@@ -6801,7 +7250,7 @@ struct llm_build_context {
6801
7250
 
6802
7251
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6803
7252
  model.layers[il].wo, model.layers[il].bo,
6804
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7253
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6805
7254
  cb(cur, "kqv_out", il);
6806
7255
  }
6807
7256
 
@@ -6933,7 +7382,7 @@ struct llm_build_context {
6933
7382
 
6934
7383
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6935
7384
  model.layers[il].wo, model.layers[il].bo,
6936
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7385
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6937
7386
  cb(cur, "kqv_out", il);
6938
7387
  }
6939
7388
 
@@ -6992,16 +7441,124 @@ struct llm_build_context {
6992
7441
 
6993
7442
  return gf;
6994
7443
  }
7444
+
7445
+ struct ggml_cgraph * build_gemma() {
7446
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7447
+
7448
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
7449
+
7450
+ struct ggml_tensor * cur;
7451
+ struct ggml_tensor * inpL;
7452
+
7453
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7454
+ cb(inpL, "inp_embd", -1);
7455
+
7456
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
7457
+ cb(inpL, "inp_scaled", -1);
7458
+
7459
+ // inp_pos - contains the positions
7460
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7461
+ cb(inp_pos, "inp_pos", -1);
7462
+
7463
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7464
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7465
+ cb(KQ_mask, "KQ_mask", -1);
7466
+
7467
+ // shift the entire K-cache if needed
7468
+ if (do_rope_shift) {
7469
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7470
+ }
7471
+
7472
+ for (int il = 0; il < n_layer; ++il) {
7473
+
7474
+ // norm
7475
+ cur = llm_build_norm(ctx0, inpL, hparams,
7476
+ model.layers[il].attn_norm, NULL,
7477
+ LLM_NORM_RMS, cb, il);
7478
+ cb(cur, "attn_norm", il);
7479
+
7480
+ // self-attention
7481
+ {
7482
+ // compute Q and K and RoPE them
7483
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
7484
+ cb(Qcur, "Qcur", il);
7485
+
7486
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
7487
+ cb(Kcur, "Kcur", il);
7488
+
7489
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7490
+ cb(Vcur, "Vcur", il);
7491
+
7492
+ Qcur = ggml_rope_custom(
7493
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
7494
+ n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7495
+ ext_factor, attn_factor, beta_fast, beta_slow);
7496
+ cb(Qcur, "Qcur", il);
7497
+
7498
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
7499
+ cb(Qcur, "Qcur_scaled", il);
7500
+
7501
+ Kcur = ggml_rope_custom(
7502
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
7503
+ n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7504
+ ext_factor, attn_factor, beta_fast, beta_slow);
7505
+ cb(Kcur, "Kcur", il);
7506
+
7507
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7508
+ model.layers[il].wo, NULL,
7509
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7510
+ cb(cur, "kqv_out", il);
7511
+ }
7512
+
7513
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
7514
+ cb(sa_out, "sa_out", il);
7515
+
7516
+ cur = llm_build_norm(ctx0, sa_out, hparams,
7517
+ model.layers[il].ffn_norm, NULL,
7518
+ LLM_NORM_RMS, cb, il);
7519
+ cb(cur, "ffn_norm", il);
7520
+
7521
+ // feed-forward network
7522
+ {
7523
+ cur = llm_build_ffn(ctx0, cur,
7524
+ model.layers[il].ffn_up, NULL,
7525
+ model.layers[il].ffn_gate, NULL,
7526
+ model.layers[il].ffn_down, NULL,
7527
+ NULL,
7528
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
7529
+ cb(cur, "ffn_out", il);
7530
+ }
7531
+
7532
+ cur = ggml_add(ctx0, cur, sa_out);
7533
+ cb(cur, "l_out", il);
7534
+
7535
+ // input for next layer
7536
+ inpL = cur;
7537
+ }
7538
+
7539
+ cur = inpL;
7540
+
7541
+ cur = llm_build_norm(ctx0, cur, hparams,
7542
+ model.output_norm, NULL,
7543
+ LLM_NORM_RMS, cb, -1);
7544
+ cb(cur, "result_norm", -1);
7545
+
7546
+ // lm_head
7547
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7548
+ cb(cur, "result_output", -1);
7549
+
7550
+ ggml_build_forward_expand(gf, cur);
7551
+
7552
+ return gf;
7553
+ }
6995
7554
  };
6996
7555
 
6997
7556
  static struct ggml_cgraph * llama_build_graph(
6998
7557
  llama_context & lctx,
6999
- const llama_batch & batch) {
7558
+ const llama_batch & batch,
7559
+ bool worst_case) {
7000
7560
  const auto & model = lctx.model;
7001
7561
 
7002
- // check if we should build the worst-case graph (for memory measurement)
7003
- const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
7004
-
7005
7562
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7006
7563
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7007
7564
  if (il >= 0) {
@@ -7022,67 +7579,6 @@ static struct ggml_cgraph * llama_build_graph(
7022
7579
 
7023
7580
  struct llm_build_context llm(lctx, batch, cb, worst_case);
7024
7581
 
7025
- //
7026
- // set input data
7027
- //
7028
-
7029
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
7030
- if (batch.token) {
7031
- const int64_t n_tokens = batch.n_tokens;
7032
-
7033
- ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
7034
- }
7035
-
7036
- if (batch.embd) {
7037
- const int64_t n_embd = llm.n_embd;
7038
- const int64_t n_tokens = batch.n_tokens;
7039
-
7040
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7041
- }
7042
-
7043
- if (batch.pos) {
7044
- const int64_t n_tokens = batch.n_tokens;
7045
-
7046
- ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7047
- }
7048
-
7049
- {
7050
- const int64_t n_kv = llm.n_kv;
7051
- const int64_t n_tokens = batch.n_tokens;
7052
-
7053
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7054
- float * data = (float *) lctx.inp_KQ_mask->data;
7055
-
7056
- for (int h = 0; h < 1; ++h) {
7057
- for (int j = 0; j < n_tokens; ++j) {
7058
- const llama_pos pos = batch.pos[j];
7059
- const llama_seq_id seq_id = batch.seq_id[j][0];
7060
-
7061
- for (int i = 0; i < n_kv; ++i) {
7062
- float f;
7063
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
7064
- f = -INFINITY;
7065
- } else {
7066
- f = 0;
7067
- }
7068
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7069
- }
7070
- }
7071
- }
7072
- }
7073
-
7074
- if (llm.do_rope_shift) {
7075
- const int64_t n_ctx = llm.n_ctx;
7076
-
7077
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7078
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7079
-
7080
- for (int i = 0; i < n_ctx; ++i) {
7081
- data[i] = lctx.kv_self.cells[i].delta;
7082
- }
7083
- }
7084
- }
7085
-
7086
7582
  llm.init();
7087
7583
 
7088
7584
  switch (model.arch) {
@@ -7110,6 +7606,11 @@ static struct ggml_cgraph * llama_build_graph(
7110
7606
  {
7111
7607
  result = llm.build_refact();
7112
7608
  } break;
7609
+ case LLM_ARCH_BERT:
7610
+ case LLM_ARCH_NOMIC_BERT:
7611
+ {
7612
+ result = llm.build_bert();
7613
+ } break;
7113
7614
  case LLM_ARCH_BLOOM:
7114
7615
  {
7115
7616
  result = llm.build_bloom();
@@ -7158,6 +7659,10 @@ static struct ggml_cgraph * llama_build_graph(
7158
7659
  {
7159
7660
  result = llm.build_minicpm();
7160
7661
  } break;
7662
+ case LLM_ARCH_GEMMA:
7663
+ {
7664
+ result = llm.build_gemma();
7665
+ } break;
7161
7666
  default:
7162
7667
  GGML_ASSERT(false);
7163
7668
  }
@@ -7167,6 +7672,129 @@ static struct ggml_cgraph * llama_build_graph(
7167
7672
  return result;
7168
7673
  }
7169
7674
 
7675
+ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7676
+ //
7677
+ // set input data
7678
+ //
7679
+
7680
+ const auto & hparams = lctx.model.hparams;
7681
+ const auto & cparams = lctx.cparams;
7682
+ const auto & kv_self = lctx.kv_self;
7683
+
7684
+ if (batch.token) {
7685
+ const int64_t n_tokens = batch.n_tokens;
7686
+
7687
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
7688
+ }
7689
+
7690
+ if (batch.embd) {
7691
+ const int64_t n_embd = hparams.n_embd;
7692
+ const int64_t n_tokens = batch.n_tokens;
7693
+
7694
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7695
+ }
7696
+
7697
+ if (batch.pos) {
7698
+ const int64_t n_tokens = batch.n_tokens;
7699
+
7700
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7701
+ }
7702
+
7703
+ {
7704
+ const int64_t n_kv = kv_self.n;
7705
+ const int64_t n_tokens = batch.n_tokens;
7706
+
7707
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7708
+
7709
+ float * data = (float *) lctx.inp_KQ_mask->data;
7710
+
7711
+ for (int h = 0; h < 1; ++h) {
7712
+ for (int j = 0; j < n_tokens; ++j) {
7713
+ const llama_pos pos = batch.pos[j];
7714
+ const llama_seq_id seq_id = batch.seq_id[j][0];
7715
+
7716
+ for (int i = 0; i < n_kv; ++i) {
7717
+ float f;
7718
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7719
+ (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
7720
+ f = -INFINITY;
7721
+ } else {
7722
+ f = 0;
7723
+ }
7724
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7725
+ }
7726
+ }
7727
+ }
7728
+ }
7729
+
7730
+ if (hparams.need_kq_pos) {
7731
+ const int64_t n_kv = kv_self.n;
7732
+
7733
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
7734
+
7735
+ float * data = (float *) lctx.inp_KQ_pos->data;
7736
+
7737
+ for (int i = 0; i < n_kv; ++i) {
7738
+ data[i] = float(lctx.kv_self.cells[i].pos);
7739
+ }
7740
+ }
7741
+
7742
+ if (kv_self.has_shift) {
7743
+ const int64_t n_ctx = cparams.n_ctx;
7744
+
7745
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7746
+
7747
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7748
+
7749
+ for (int i = 0; i < n_ctx; ++i) {
7750
+ data[i] = lctx.kv_self.cells[i].delta;
7751
+ }
7752
+ }
7753
+
7754
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
7755
+ const int64_t n_tokens = batch.n_tokens;
7756
+
7757
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7758
+ float * data = (float *) lctx.inp_mean->data;
7759
+
7760
+ memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7761
+
7762
+ std::vector<uint64_t> sum(n_tokens, 0);
7763
+ for (int i = 0; i < n_tokens; ++i) {
7764
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7765
+ sum[seq_id] += 1;
7766
+ }
7767
+
7768
+ std::vector<float> div(n_tokens, 0.0f);
7769
+ for (int i = 0; i < n_tokens; ++i) {
7770
+ const uint64_t s = sum[i];
7771
+ if (s > 0) {
7772
+ div[i] = 1.0f/float(s);
7773
+ }
7774
+ }
7775
+
7776
+ for (int i = 0; i < n_tokens; ++i) {
7777
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7778
+ data[seq_id*n_tokens + i] = div[seq_id];
7779
+ }
7780
+ }
7781
+
7782
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
7783
+ const int64_t n_tokens = batch.n_tokens;
7784
+
7785
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
7786
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
7787
+
7788
+ for (int i = 0; i < n_tokens; ++i) {
7789
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7790
+ const llama_pos pos = batch.pos[i];
7791
+ if (pos == 0) {
7792
+ data[seq_id] = i;
7793
+ }
7794
+ }
7795
+ }
7796
+ }
7797
+
7170
7798
  // decode a batch of tokens by evaluating the transformer
7171
7799
  //
7172
7800
  // - lctx: llama context
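Note: the new llama_set_inputs helper above now also owns the pooling inputs (inp_mean, inp_cls) and applies the causal mask only when hparams.causal_attn is set, which is what lets the BERT/nomic-bert path run with a full (non-causal) mask. As a minimal sketch of what the mean-pooling branch computes, assuming a toy batch whose seq_id values are small and contiguous (the same indexing the diff uses for inp_mean): row seq_id of the matrix holds 1/len(sequence) at every token column belonging to that sequence, so multiplying it against the token embeddings yields one mean vector per sequence.

    #include <cstdio>
    #include <vector>

    // Toy illustration of the inp_mean construction in llama_set_inputs:
    // row s gets 1/(number of tokens in sequence s) at every column i whose
    // token belongs to sequence s, and 0 elsewhere.
    int main() {
        const int n_tokens = 4;
        const int seq_id_of[n_tokens] = {0, 0, 0, 1}; // 3 tokens in seq 0, 1 token in seq 1

        std::vector<float> inp_mean(n_tokens*n_tokens, 0.0f);

        std::vector<int> count(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            count[seq_id_of[i]] += 1;
        }

        for (int i = 0; i < n_tokens; ++i) {
            const int s = seq_id_of[i];
            inp_mean[s*n_tokens + i] = 1.0f/float(count[s]);
        }

        // row 0: 0.333 0.333 0.333 0.000   row 1: 0.000 0.000 0.000 1.000
        for (int s = 0; s < 2; ++s) {
            for (int i = 0; i < n_tokens; ++i) {
                printf("%.3f ", inp_mean[s*n_tokens + i]);
            }
            printf("\n");
        }
        return 0;
    }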
@@ -7265,17 +7893,22 @@ static int llama_decode_internal(
7265
7893
  ggml_backend_sched_reset(lctx.sched);
7266
7894
  ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
7267
7895
 
7268
- ggml_cgraph * gf = llama_build_graph(lctx, batch);
7896
+ ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7269
7897
 
7270
7898
  // the output is always the last tensor in the graph
7271
7899
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7272
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
7273
-
7274
- // the embeddings could be the second to last tensor, or the third to last tensor
7275
7900
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7276
- if (strcmp(embeddings->name, "result_norm") != 0) {
7277
- embeddings = gf->nodes[gf->n_nodes - 3];
7278
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7901
+ if (strcmp(res->name, "result_output") == 0) {
7902
+ // the embeddings could be the second to last tensor, or the third to last tensor
7903
+ if (strcmp(embeddings->name, "result_norm") != 0) {
7904
+ embeddings = gf->nodes[gf->n_nodes - 3];
7905
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7906
+ }
7907
+ } else if (strcmp(res->name, "result_embd") == 0) {
7908
+ embeddings = res;
7909
+ res = nullptr;
7910
+ } else {
7911
+ GGML_ASSERT(false);
7279
7912
  }
7280
7913
 
7281
7914
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7285,7 +7918,9 @@ static int llama_decode_internal(
7285
7918
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
7286
7919
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
7287
7920
  // with the BLAS calls. need a better solution
7288
- if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
7921
+ // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
7922
+ // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
7923
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
7289
7924
  n_threads = std::min(4, n_threads);
7290
7925
  }
7291
7926
 
@@ -7303,6 +7938,9 @@ static int llama_decode_internal(
7303
7938
  if (lctx.backend_cpu != nullptr) {
7304
7939
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7305
7940
  }
7941
+
7942
+ llama_set_inputs(lctx, batch);
7943
+
7306
7944
  ggml_backend_sched_graph_compute(lctx.sched, gf);
7307
7945
 
7308
7946
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
@@ -7342,7 +7980,7 @@ static int llama_decode_internal(
7342
7980
  // extract logits
7343
7981
  // TODO: do not compute and extract logits if only embeddings are needed
7344
7982
  // need to update the graphs to skip "result_output"
7345
- {
7983
+ if (res) {
7346
7984
  auto & logits_out = lctx.logits;
7347
7985
 
7348
7986
  #ifndef NDEBUG
@@ -7386,9 +8024,12 @@ static int llama_decode_internal(
7386
8024
  if (!lctx.embedding.empty()) {
7387
8025
  auto & embedding_out = lctx.embedding;
7388
8026
 
7389
- embedding_out.resize(n_embd);
8027
+ const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8028
+ const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8029
+
8030
+ embedding_out.resize(embd_size);
7390
8031
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
7391
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
8032
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
7392
8033
  ggml_backend_synchronize(embeddings_backend);
7393
8034
  }
7394
8035
 
@@ -7452,6 +8093,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
7452
8093
  GGML_ASSERT(false);
7453
8094
  return unicode_to_bytes_bpe(token_data.text);
7454
8095
  }
8096
+ case LLAMA_VOCAB_TYPE_WPM: {
8097
+ GGML_ASSERT(false);
8098
+ }
7455
8099
  default:
7456
8100
  GGML_ASSERT(false);
7457
8101
  }
@@ -7462,8 +8106,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
7462
8106
  switch (llama_vocab_get_type(vocab)) {
7463
8107
  case LLAMA_VOCAB_TYPE_SPM: {
7464
8108
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7465
- return vocab.token_to_id.at(buf);
8109
+ auto token = vocab.token_to_id.find(buf);
8110
+ if (token != vocab.token_to_id.end()) {
8111
+ return (*token).second;
8112
+ }
8113
+ // Try to fall back to just the byte as a string
8114
+ const char buf2[2] = { (char)ch, 0 };
8115
+ return vocab.token_to_id.at(buf2);
7466
8116
  }
8117
+ case LLAMA_VOCAB_TYPE_WPM:
7467
8118
  case LLAMA_VOCAB_TYPE_BPE: {
7468
8119
  return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
7469
8120
  }
@@ -7509,7 +8160,7 @@ struct llm_bigram_spm {
7509
8160
  };
7510
8161
 
7511
8162
  struct llm_tokenizer_spm {
7512
- llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
8163
+ llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
7513
8164
 
7514
8165
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
7515
8166
  // split string into utf8 chars
@@ -7584,6 +8235,7 @@ private:
7584
8235
 
7585
8236
  if (p == rev_merge.end()) {
7586
8237
  // output any symbols that did not form tokens as bytes.
8238
+ output.reserve(output.size() + symbol.n);
7587
8239
  for (int j = 0; j < (int)symbol.n; ++j) {
7588
8240
  llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
7589
8241
  output.push_back(token_id);
@@ -7934,29 +8586,230 @@ private:
7934
8586
  llm_bigram_bpe::queue work_queue;
7935
8587
  };
7936
8588
 
7937
- typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
8589
+ struct llm_tokenizer_wpm {
8590
+ llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
8591
+
8592
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
8593
+ auto * token_map = &vocab.token_to_id;
8594
+
8595
+ // normalize and split by whitespace
8596
+ std::vector<std::string> words = preprocess(text);
8597
+
8598
+ // bos token prepended already
8599
+
8600
+ // find the longest tokens that form the words
8601
+ for (const std::string &word : words) {
8602
+ // skip empty words
8603
+ if (word.size() == 0) {
8604
+ continue;
8605
+ }
8606
+
8607
+ // prepend phantom space
8608
+ std::string word1 = "\xe2\x96\x81" + word;
8609
+ int n = word1.size();
8610
+
8611
+ // we're at the start of a new word
8612
+ int i = 0;
8613
+ bool match_any = false;
8614
+
8615
+ // move through character position in word
8616
+ while (i < n) {
8617
+ // loop through possible match length
8618
+ bool match = false;
8619
+ for (int j = n; j > i; j--) {
8620
+ auto it = token_map->find(word1.substr(i, j - i));
8621
+ if (it != token_map->end()) {
8622
+ output.push_back(it->second);
8623
+ match = true;
8624
+ match_any = true;
8625
+ i = j;
8626
+ break;
8627
+ }
8628
+ }
8629
+
8630
+ // must be an unknown character
8631
+ if (!match) {
8632
+ i++;
8633
+ }
8634
+ }
8635
+
8636
+ // we didn't find any matches for this word
8637
+ if (!match_any) {
8638
+ output.push_back(vocab.special_unk_id);
8639
+ }
8640
+ }
8641
+
8642
+ // append eos token
8643
+ output.push_back(vocab.special_eos_id);
8644
+ }
8645
+
8646
+ std::vector<std::string> preprocess(const std::string & text) {
8647
+ std::string ori_str = normalize(text);
8648
+ uint64_t ori_size = ori_str.size();
8649
+
8650
+ // single punct / single symbol / single digit
8651
+ // baseline: add whitespace on the left and right of punct and chinese characters
8652
+ std::vector<std::string> words;
8653
+ std::string new_str = "";
8654
+ uint64_t i = 0;
8655
+ while (i < ori_size) {
8656
+ int utf_char_len = utf8_len(ori_str[i]);
8657
+ if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8658
+ new_str += " ";
8659
+ new_str += ori_str[i];
8660
+ new_str += " ";
8661
+ i += 1;
8662
+ }
8663
+ else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
8664
+ new_str += " ";
8665
+ new_str += ori_str.substr(i, 3);
8666
+ new_str += " ";
8667
+ i += 3;
8668
+ }
8669
+ else {
8670
+ new_str += ori_str[i];
8671
+ i += 1;
8672
+ }
8673
+ }
8674
+
8675
+ // split by whitespace
8676
+ uint64_t l = 0;
8677
+ uint64_t r = 0;
8678
+ while (r < new_str.size()) {
8679
+ // if is whitespace
8680
+ if (isspace(new_str[r])) {
8681
+ if (r > l) words.push_back(new_str.substr(l, (r - l)));
8682
+ l = r + 1;
8683
+ r = l;
8684
+ }
8685
+ else {
8686
+ r += 1;
8687
+ }
8688
+ }
8689
+ if (r > l) {
8690
+ words.push_back(new_str.substr(l, (r - l)));
8691
+ }
8692
+ return words;
8693
+ }
8694
+
8695
+ std::string normalize(const std::string & text) {
8696
+ // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8697
+ std::string text2 = strip_accents(text);
8698
+ for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8699
+ char c = text2[i];
8700
+ if (c >= 'A' && c <= 'Z') {
8701
+ text2[i] = c - 'A' + 'a';
8702
+ }
8703
+ }
8704
+ return text2;
8705
+ }
8706
+
8707
+ bool is_chinese_char(const std::string & str) {
8708
+ int len = str.length();
8709
+ unsigned int codepoint = 0;
8710
+ int num_bytes = 0;
8711
+ int i = 0;
8712
+ unsigned char ch = static_cast<unsigned char>(str[i]);
8713
+ if (ch <= 0x7f) {
8714
+ codepoint = ch;
8715
+ num_bytes = 1;
8716
+ } else if ((ch >> 5) == 0x06) {
8717
+ codepoint = ch & 0x1f;
8718
+ num_bytes = 2;
8719
+ } else if ((ch >> 4) == 0x0e) {
8720
+ codepoint = ch & 0x0f;
8721
+ num_bytes = 3;
8722
+ } else if ((ch >> 3) == 0x1e) {
8723
+ codepoint = ch & 0x07;
8724
+ num_bytes = 4;
8725
+ }
8726
+ for (int j = 1; j < num_bytes; ++j) {
8727
+ if (i + j >= len) {
8728
+ return false; // incomplete UTF-8 character
8729
+ }
8730
+ unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8731
+ if ((next_ch >> 6) != 0x02) {
8732
+ return false; // invalid trailing byte
8733
+ }
8734
+ codepoint = (codepoint << 6) | (next_ch & 0x3f);
8735
+ }
8736
+ if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8737
+ (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8738
+ (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
8739
+ (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
8740
+ (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
8741
+ (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
8742
+ (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
8743
+ (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
8744
+ (codepoint >= 0x3000 && codepoint <= 0x303F) ||
8745
+ (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
8746
+ return true; // NOLINT
8747
+ }
8748
+ return false;
8749
+ }
8750
+
8751
+ std::string strip_accents(const std::string & input_string) {
8752
+ std::string resultString;
8753
+ std::map<std::string, char> accent_map = {
8754
+ {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8755
+ {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8756
+ {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8757
+ {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8758
+ {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8759
+ {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8760
+ {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8761
+ {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8762
+ {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8763
+ };
8764
+
8765
+ for (size_t i = 0; i < input_string.length();) {
8766
+ int len = utf8_len(input_string[i]);
8767
+ std::string curChar = input_string.substr(i, len);
8768
+ auto iter = accent_map.find(curChar);
8769
+ if (iter != accent_map.end()) {
8770
+ resultString += iter->second;
8771
+ } else {
8772
+ resultString += curChar;
8773
+ }
8774
+ i += len;
8775
+ }
8776
+
8777
+ return resultString;
8778
+ }
8779
+
8780
+ static size_t utf8_len(char src) {
8781
+ const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8782
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8783
+ return lookup[highbits];
8784
+ }
8785
+
8786
+ const llama_vocab & vocab;
8787
+ };
8788
+
8789
+ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
7938
8790
  FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
7939
8791
  FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
7940
8792
  } FRAGMENT_BUFFER_VARIANT_TYPE;
7941
8793
 
7942
- struct fragment_buffer_variant{
8794
+ struct fragment_buffer_variant {
7943
8795
  fragment_buffer_variant(llama_vocab::id _token)
7944
8796
  :
7945
8797
  type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
7946
8798
  token(_token),
7947
8799
  raw_text(_dummy),
7948
8800
  offset(0),
7949
- length(0){}
8801
+ length(0) {}
8802
+
7950
8803
  fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
7951
8804
  :
7952
8805
  type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
7953
- token((llama_vocab::id)-1),
8806
+ token((llama_vocab::id) - 1),
7954
8807
  raw_text(_raw_text),
7955
8808
  offset(_offset),
7956
8809
  length(_length){
7957
- GGML_ASSERT( _offset >= 0 );
7958
- GGML_ASSERT( _length >= 1 );
7959
- GGML_ASSERT( offset + length <= raw_text.length() );
8810
+ GGML_ASSERT(_offset >= 0);
8811
+ GGML_ASSERT(_length >= 1);
8812
+ GGML_ASSERT(offset + length <= raw_text.length());
7960
8813
  }
7961
8814
 
7962
8815
  const FRAGMENT_BUFFER_VARIANT_TYPE type;
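Note: the llm_tokenizer_wpm added above is a greedy longest-match (WordPiece-style) tokenizer: each whitespace-split word gets the phantom-space prefix and is then consumed by always trying the longest vocabulary entry first. A standalone sketch of that matching loop, using a made-up toy vocabulary (not the real BERT vocab, and with the phantom-space prefix omitted for brevity):

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // Mirrors the longest-match loop in llm_tokenizer_wpm::tokenize: at each
    // position try the longest substring first, fall back to shorter ones,
    // and skip characters that match nothing (the real code emits UNK when a
    // whole word produced no match at all).
    static std::vector<int> wpm_match(const std::string & word, const std::map<std::string, int> & vocab) {
        std::vector<int> out;
        const int n = (int) word.size();
        int i = 0;
        while (i < n) {
            bool match = false;
            for (int j = n; j > i; j--) {
                auto it = vocab.find(word.substr(i, j - i));
                if (it != vocab.end()) {
                    out.push_back(it->second);
                    i = j;
                    match = true;
                    break;
                }
            }
            if (!match) {
                i++;
            }
        }
        return out;
    }

    int main() {
        // hypothetical token ids, for illustration only
        const std::map<std::string, int> vocab = { {"un", 1}, {"break", 2}, {"able", 3}, {"unbreak", 4} };
        for (int id : wpm_match("unbreakable", vocab)) {
            printf("%d ", id); // prints: 4 3
        }
        printf("\n");
        return 0;
    }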
@@ -7969,8 +8822,7 @@ struct fragment_buffer_variant{
7969
8822
 
7970
8823
  // #define PRETOKENIZERDEBUG
7971
8824
 
7972
- static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
7973
- {
8825
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
7974
8826
  // for each special token
7975
8827
  for (const auto & st: vocab.special_tokens_cache) {
7976
8828
  const auto & special_token = st.first;
@@ -8081,17 +8933,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8081
8933
  }
8082
8934
 
8083
8935
  std::forward_list<fragment_buffer_variant> fragment_buffer;
8084
- fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
8936
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
8085
8937
 
8086
- if (special) tokenizer_st_partition( vocab, fragment_buffer );
8938
+ if (special) tokenizer_st_partition(vocab, fragment_buffer);
8087
8939
 
8088
8940
  switch (vocab.type) {
8089
8941
  case LLAMA_VOCAB_TYPE_SPM:
8090
8942
  {
8091
- for (const auto & fragment: fragment_buffer)
8092
- {
8093
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
8094
- {
8943
+ for (const auto & fragment : fragment_buffer) {
8944
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8095
8945
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
8096
8946
 
8097
8947
  // TODO: It's likely possible to get rid of this string copy entirely
@@ -8111,19 +8961,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8111
8961
  llm_tokenizer_spm tokenizer(vocab);
8112
8962
  llama_escape_whitespace(raw_text);
8113
8963
  tokenizer.tokenize(raw_text, output);
8114
- }
8115
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8116
- {
8964
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8117
8965
  output.push_back(fragment.token);
8118
8966
  }
8119
8967
  }
8120
8968
  } break;
8121
8969
  case LLAMA_VOCAB_TYPE_BPE:
8122
8970
  {
8123
- for (const auto & fragment: fragment_buffer)
8124
- {
8125
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
8126
- {
8971
+ for (const auto & fragment : fragment_buffer) {
8972
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8127
8973
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8128
8974
 
8129
8975
  #ifdef PRETOKENIZERDEBUG
@@ -8131,9 +8977,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8131
8977
  #endif
8132
8978
  llm_tokenizer_bpe tokenizer(vocab);
8133
8979
  tokenizer.tokenize(raw_text, output);
8980
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8981
+ output.push_back(fragment.token);
8134
8982
  }
8135
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8136
- {
8983
+ }
8984
+ } break;
8985
+ case LLAMA_VOCAB_TYPE_WPM:
8986
+ {
8987
+ for (const auto & fragment : fragment_buffer) {
8988
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8989
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8990
+
8991
+ #ifdef PRETOKENIZERDEBUG
8992
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
8993
+ #endif
8994
+ llm_tokenizer_wpm tokenizer(vocab);
8995
+ tokenizer.tokenize(raw_text, output);
8996
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8137
8997
  output.push_back(fragment.token);
8138
8998
  }
8139
8999
  }
@@ -9640,25 +10500,28 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9640
10500
  return std::make_pair(i_layer, n_layer);
9641
10501
  };
9642
10502
 
9643
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
10503
+ // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
10504
+ // with the quantization of the output tensor
10505
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
10506
+ (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
9644
10507
  int nx = tensor->ne[0];
9645
10508
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
9646
10509
  new_type = GGML_TYPE_Q8_0;
9647
10510
  }
9648
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10511
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
9649
10512
  new_type = GGML_TYPE_Q5_K;
9650
10513
  }
9651
10514
  else if (new_type != GGML_TYPE_Q8_0) {
9652
10515
  new_type = GGML_TYPE_Q6_K;
9653
10516
  }
9654
10517
  } else if (name == "token_embd.weight") {
9655
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10518
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
9656
10519
  new_type = GGML_TYPE_Q2_K;
9657
10520
  }
9658
10521
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9659
10522
  new_type = GGML_TYPE_Q4_K;
9660
10523
  }
9661
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10524
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
9662
10525
  if (name.find("attn_v.weight") != std::string::npos) {
9663
10526
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
9664
10527
  else new_type = GGML_TYPE_Q2_K;
@@ -9668,6 +10531,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9668
10531
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
9669
10532
  ++qs.i_ffn_down;
9670
10533
  }
10534
+ else if (name.find("attn_output.weight") != std::string::npos) {
10535
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10536
+ }
9671
10537
  } else if (name.find("attn_v.weight") != std::string::npos) {
9672
10538
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
9673
10539
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -9682,6 +10548,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9682
10548
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
9683
10549
  }
9684
10550
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10551
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
10552
+ new_type = GGML_TYPE_Q5_K;
10553
+ }
9685
10554
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
9686
10555
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
9687
10556
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -9734,6 +10603,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9734
10603
  if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
9735
10604
  }
9736
10605
  }
10606
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
10607
+ if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
10608
+ }
9737
10609
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
9738
10610
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
9739
10611
  new_type = GGML_TYPE_Q5_K;
@@ -9750,7 +10622,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9750
10622
  if (arch != LLM_ARCH_FALCON) {
9751
10623
  if (qs.model.hparams.n_expert == 8) {
9752
10624
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
9753
- ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
10625
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
9754
10626
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
9755
10627
  new_type = GGML_TYPE_Q5_K;
9756
10628
  }
@@ -9785,6 +10657,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9785
10657
  }
9786
10658
  ++qs.i_ffn_up;
9787
10659
  }
10660
+
9788
10661
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
9789
10662
  //}
9790
10663
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -9800,7 +10673,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9800
10673
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
9801
10674
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
9802
10675
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
9803
- new_type == GGML_TYPE_IQ3_XXS) {
10676
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
9804
10677
  int nx = tensor->ne[0];
9805
10678
  int ny = tensor->ne[1];
9806
10679
  if (nx % QK_K != 0) {
@@ -9815,8 +10688,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9815
10688
  case GGML_TYPE_IQ2_XXS:
9816
10689
  case GGML_TYPE_IQ2_XS:
9817
10690
  case GGML_TYPE_IQ3_XXS:
9818
- case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
9819
- case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
10691
+ case GGML_TYPE_IQ1_S:
10692
+ case GGML_TYPE_Q2_K:
10693
+ case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
9820
10694
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
9821
10695
  case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
9822
10696
  case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
@@ -9844,19 +10718,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9844
10718
 
9845
10719
  // K-quants
9846
10720
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
9847
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10721
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
9848
10722
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
9849
10723
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
9850
10724
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
9851
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
10725
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
9852
10726
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
9853
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
10727
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
9854
10728
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
9855
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
9856
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
9857
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
9858
- case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
9859
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
10729
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
10730
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10731
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10732
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10733
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10734
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10735
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
9860
10736
 
9861
10737
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
9862
10738
  }
@@ -9986,7 +10862,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9986
10862
  quantize &= !params->only_copy;
9987
10863
 
9988
10864
  // do not quantize expert gating tensors
9989
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
10865
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
10866
+
10867
+ // do not quantize positional embeddings and token types (BERT)
10868
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
10869
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
9990
10870
 
9991
10871
  enum ggml_type new_type;
9992
10872
  void * new_data;
@@ -10026,6 +10906,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10026
10906
  }
10027
10907
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10028
10908
  new_type == GGML_TYPE_IQ2_XS ||
10909
+ new_type == GGML_TYPE_IQ1_S ||
10029
10910
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10030
10911
  LLAMA_LOG_ERROR("\n\n============================================================\n");
10031
10912
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -10260,7 +11141,7 @@ static int llama_apply_lora_from_file_internal(
10260
11141
  {
10261
11142
  LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
10262
11143
  __func__, ftype);
10263
- return false;
11144
+ return 1;
10264
11145
  }
10265
11146
  }
10266
11147
 
@@ -10488,6 +11369,7 @@ struct llama_context_params llama_context_default_params() {
10488
11369
  /*.logits_all =*/ false,
10489
11370
  /*.embedding =*/ false,
10490
11371
  /*.offload_kqv =*/ true,
11372
+ /*.do_pooling =*/ true,
10491
11373
  };
10492
11374
 
10493
11375
  return result;
@@ -10548,7 +11430,7 @@ bool llama_mlock_supported(void) {
10548
11430
  return llama_supports_mlock();
10549
11431
  }
10550
11432
 
10551
- void llama_backend_init(bool numa) {
11433
+ void llama_backend_init(void) {
10552
11434
  ggml_time_init();
10553
11435
 
10554
11436
  // needed to initialize f16 tables
@@ -10558,15 +11440,17 @@ void llama_backend_init(bool numa) {
10558
11440
  ggml_free(ctx);
10559
11441
  }
10560
11442
 
10561
- if (numa) {
10562
- ggml_numa_init();
10563
- }
10564
-
10565
11443
  #ifdef GGML_USE_MPI
10566
11444
  ggml_mpi_backend_init();
10567
11445
  #endif
10568
11446
  }
10569
11447
 
11448
+ void llama_numa_init(enum ggml_numa_strategy numa) {
11449
+ if (numa != GGML_NUMA_STRATEGY_DISABLED) {
11450
+ ggml_numa_init(numa);
11451
+ }
11452
+ }
11453
+
10570
11454
  void llama_backend_free(void) {
10571
11455
  #ifdef GGML_USE_MPI
10572
11456
  ggml_mpi_backend_free();
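Note: this is a breaking change for callers of the C API — llama_backend_init no longer takes the bool numa flag, and NUMA configuration moves to the separate llama_numa_init call shown above. A minimal migration sketch; GGML_NUMA_STRATEGY_DISABLED is the value referenced in this diff, and other strategy values (e.g. GGML_NUMA_STRATEGY_DISTRIBUTE) are assumed to be provided by the bundled ggml headers:

    #include "llama.h"

    int main() {
        // before (0.12.5): llama_backend_init(/*numa =*/ true);
        // after  (0.12.7): backend init and NUMA init are separate calls.
        llama_backend_init();

        // pick a strategy; passing GGML_NUMA_STRATEGY_DISABLED is a no-op
        llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

        // ... load a model, create a context, decode ...

        llama_backend_free();
        return 0;
    }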
@@ -10643,6 +11527,7 @@ struct llama_context * llama_new_context_with_model(
10643
11527
  cparams.yarn_beta_slow = params.yarn_beta_slow;
10644
11528
  cparams.mul_mat_q = params.mul_mat_q;
10645
11529
  cparams.offload_kqv = params.offload_kqv;
11530
+ cparams.do_pooling = params.do_pooling;
10646
11531
 
10647
11532
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
10648
11533
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -10790,14 +11675,14 @@ struct llama_context * llama_new_context_with_model(
10790
11675
  // resized during inference, reserve maximum
10791
11676
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
10792
11677
 
10793
- if (params.embedding){
11678
+ if (params.embedding) {
10794
11679
  ctx->embedding.resize(hparams.n_embd);
10795
11680
  }
10796
11681
 
10797
11682
  // graph inputs
10798
11683
  {
10799
11684
  ggml_init_params init_params = {
10800
- /* .mem_size */ ggml_tensor_overhead()*5,
11685
+ /* .mem_size */ ggml_tensor_overhead()*8,
10801
11686
  /* .mem_buffer */ nullptr,
10802
11687
  /* .no_alloc */ true,
10803
11688
  };
@@ -10807,13 +11692,19 @@ struct llama_context * llama_new_context_with_model(
10807
11692
  ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
10808
11693
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
10809
11694
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
11695
+ ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
10810
11696
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11697
+ ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
11698
+ ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
10811
11699
 
10812
11700
  ggml_set_name(ctx->inp_tokens, "inp_tokens");
10813
11701
  ggml_set_name(ctx->inp_embd, "inp_embd");
10814
11702
  ggml_set_name(ctx->inp_pos, "inp_pos");
10815
11703
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
11704
+ ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
10816
11705
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11706
+ ggml_set_name(ctx->inp_mean, "inp_mean");
11707
+ ggml_set_name(ctx->inp_cls, "inp_cls");
10817
11708
 
10818
11709
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
10819
11710
 
@@ -10839,23 +11730,27 @@ struct llama_context * llama_new_context_with_model(
10839
11730
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
10840
11731
 
10841
11732
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
10842
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
10843
11733
 
10844
11734
  // build worst-case graph
10845
11735
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
10846
11736
  int n_past = cparams.n_ctx - n_tokens;
10847
11737
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
10848
- ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
11738
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
10849
11739
 
10850
11740
  // initialize scheduler with the worst-case graph
10851
- ggml_backend_sched_init_measure(ctx->sched, gf);
10852
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
11741
+ if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
11742
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
11743
+ llama_free(ctx);
11744
+ return nullptr;
11745
+ }
10853
11746
 
10854
- for (ggml_backend_t backend : ctx->backends) {
10855
- ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
11747
+ for (size_t i = 0; i < ctx->backends.size(); i++) {
11748
+ ggml_backend_t backend = ctx->backends[i];
11749
+ ggml_backend_buffer_type_t buft = backend_buft[i];
11750
+ size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
10856
11751
  LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
10857
- ggml_backend_buffer_name(buf),
10858
- ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
11752
+ ggml_backend_buft_name(buft),
11753
+ size / 1024.0 / 1024.0);
10859
11754
  }
10860
11755
 
10861
11756
  // note: the number of splits during measure is higher than during inference due to the kv shift
@@ -11301,18 +12196,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
11301
12196
  data_ctx->write(&kv_used, sizeof(kv_used));
11302
12197
 
11303
12198
  if (kv_buf_size) {
11304
- const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
11305
-
11306
12199
  std::vector<uint8_t> tmp_buf;
11307
12200
  for (int il = 0; il < (int) n_layer; ++il) {
11308
- tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
12201
+ size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12202
+ tmp_buf.resize(k_size);
11309
12203
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
11310
12204
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
11311
12205
 
11312
12206
  // v is not contiguous, copy row by row
11313
- tmp_buf.resize(elt_size*kv_head);
12207
+ size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12208
+ size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12209
+ tmp_buf.resize(v_row_size);
11314
12210
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
11315
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
12211
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
11316
12212
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
11317
12213
  }
11318
12214
  }
@@ -11414,17 +12310,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
11414
12310
  if (kv_buf_size) {
11415
12311
  GGML_ASSERT(kv_self.total_size() == kv_buf_size);
11416
12312
 
11417
- const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
11418
-
11419
12313
  for (int il = 0; il < (int) n_layer; ++il) {
11420
- size_t k_size = elt_size*n_embd_k_gqa*kv_head;
12314
+ size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
11421
12315
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
11422
12316
  inp += k_size;
11423
12317
 
11424
12318
  // v is not contiguous, copy row by row
11425
- size_t v_row_size = elt_size*kv_head;
12319
+ size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12320
+ size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
11426
12321
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
11427
- ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
12322
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
11428
12323
  inp += v_row_size;
11429
12324
  }
11430
12325
  }
@@ -11660,6 +12555,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
11660
12555
  return ctx->embedding.data();
11661
12556
  }
11662
12557
 
12558
+ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12559
+ return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
12560
+ }
12561
+
11663
12562
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
11664
12563
  return model->vocab.id_to_token[token].text.c_str();
11665
12564
  }
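Note: llama_get_embeddings_ith is the user-facing side of the pooling changes: with an embedding model and do_pooling enabled (the new default in llama_context_params), the context holds one pooled vector per sequence and the i-th one can be read back directly. A rough usage sketch, assuming ctx was created from a BERT/nomic-bert style GGUF with params.embedding = true and that the decoded batch packed sequences with seq_id 0..n_seqs-1:

    #include "llama.h"
    #include <cstdio>

    // Sketch: read back pooled embeddings after llama_decode() of a batch
    // that contained n_seqs sequences. Assumes embedding output was enabled
    // when the context was created and do_pooling was left at its default.
    static void print_pooled_embeddings(llama_context * ctx, const llama_model * model, int n_seqs) {
        const int n_embd = llama_n_embd(model);
        for (int i = 0; i < n_seqs; ++i) {
            const float * embd = llama_get_embeddings_ith(ctx, i);
            printf("seq %d: [% .4f, % .4f, ...] (%d dims)\n", i, embd[0], embd[1], n_embd);
        }
    }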
@@ -11744,6 +12643,7 @@ static std::string llama_decode_text(const std::string & text) {
11744
12643
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
11745
12644
  if (0 <= token && token < llama_n_vocab(model)) {
11746
12645
  switch (llama_vocab_get_type(model->vocab)) {
12646
+ case LLAMA_VOCAB_TYPE_WPM:
11747
12647
  case LLAMA_VOCAB_TYPE_SPM: {
11748
12648
  // NOTE: we accept all unsupported token types,
11749
12649
  // suppressing them like CONTROL tokens.
@@ -11809,6 +12709,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
11809
12709
  return 0;
11810
12710
  }
11811
12711
 
12712
+ // trim whitespace from the beginning and end of a string
12713
+ static std::string trim(const std::string & str) {
12714
+ size_t start = 0;
12715
+ size_t end = str.size();
12716
+ while (start < end && isspace(str[start])) {
12717
+ start += 1;
12718
+ }
12719
+ while (end > start && isspace(str[end - 1])) {
12720
+ end -= 1;
12721
+ }
12722
+ return str.substr(start, end - start);
12723
+ }
12724
+
12725
+ // Simple version of "llama_apply_chat_template" that only works with strings
12726
+ // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
12727
+ static int32_t llama_chat_apply_template_internal(
12728
+ const std::string & tmpl,
12729
+ const std::vector<const llama_chat_message *> & chat,
12730
+ std::string & dest, bool add_ass) {
12731
+ // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
12732
+ std::stringstream ss;
12733
+ if (tmpl.find("<|im_start|>") != std::string::npos) {
12734
+ // chatml template
12735
+ for (auto message : chat) {
12736
+ ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
12737
+ }
12738
+ if (add_ass) {
12739
+ ss << "<|im_start|>assistant\n";
12740
+ }
12741
+ } else if (tmpl.find("[INST]") != std::string::npos) {
12742
+ // llama2 template and its variants
12743
+ // [variant] support system message
12744
+ bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
12745
+ // [variant] space before + after response
12746
+ bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
12747
+ // [variant] add BOS inside history
12748
+ bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
12749
+ // [variant] trim spaces from the input message
12750
+ bool strip_message = tmpl.find("content.strip()") != std::string::npos;
12751
+ // construct the prompt
12752
+ bool is_inside_turn = true; // skip BOS at the beginning
12753
+ ss << "[INST] ";
12754
+ for (auto message : chat) {
12755
+ std::string content = strip_message ? trim(message->content) : message->content;
12756
+ std::string role(message->role);
12757
+ if (!is_inside_turn) {
12758
+ is_inside_turn = true;
12759
+ ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
12760
+ }
12761
+ if (role == "system") {
12762
+ if (support_system_message) {
12763
+ ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
12764
+ } else {
12765
+ // if the model does not support system message, we still include it in the first message, but without <<SYS>>
12766
+ ss << content << "\n";
12767
+ }
12768
+ } else if (role == "user") {
12769
+ ss << content << " [/INST]";
12770
+ } else {
12771
+ ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
12772
+ is_inside_turn = false;
12773
+ }
12774
+ }
12775
+ // llama2 templates seem to not care about "add_generation_prompt"
12776
+ } else if (tmpl.find("<|user|>") != std::string::npos) {
12777
+ // zephyr template
12778
+ for (auto message : chat) {
12779
+ ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
12780
+ }
12781
+ if (add_ass) {
12782
+ ss << "<|assistant|>\n";
12783
+ }
12784
+ } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
12785
+ // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
12786
+ for (auto message : chat) {
12787
+ std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
12788
+ ss << bos << message->role << "\n" << message->content << "</s>\n";
12789
+ }
12790
+ if (add_ass) {
12791
+ ss << "<s>assistant\n";
12792
+ }
12793
+ } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
12794
+ // google/gemma-7b-it
12795
+ std::string system_prompt = "";
12796
+ for (auto message : chat) {
12797
+ std::string role(message->role);
12798
+ if (role == "system") {
12799
+ // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
12800
+ system_prompt = trim(message->content);
12801
+ continue;
12802
+ }
12803
+ // in gemma, "assistant" is "model"
12804
+ role = role == "assistant" ? "model" : message->role;
12805
+ ss << "<start_of_turn>" << role << "\n";
12806
+ if (!system_prompt.empty() && role != "model") {
12807
+ ss << system_prompt << "\n\n";
12808
+ system_prompt = "";
12809
+ }
12810
+ ss << trim(message->content) << "<end_of_turn>\n";
12811
+ }
12812
+ if (add_ass) {
12813
+ ss << "<start_of_turn>model\n";
12814
+ }
12815
+ } else {
12816
+ // template not supported
12817
+ return -1;
12818
+ }
12819
+ dest = ss.str();
12820
+ return dest.size();
12821
+ }
12822
+
12823
+ LLAMA_API int32_t llama_chat_apply_template(
12824
+ const struct llama_model * model,
12825
+ const char * tmpl,
12826
+ const struct llama_chat_message * chat,
12827
+ size_t n_msg,
12828
+ bool add_ass,
12829
+ char * buf,
12830
+ int32_t length) {
12831
+ std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
12832
+ if (tmpl == nullptr) {
12833
+ GGML_ASSERT(model != nullptr);
12834
+ // load template from model
12835
+ std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
12836
+ std::string template_key = "tokenizer.chat_template";
12837
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
12838
+ if (res < 0) {
12839
+ // worst case: there is no information about template, we will use chatml by default
12840
+ curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
12841
+ } else {
12842
+ curr_tmpl = std::string(model_template.data(), model_template.size());
12843
+ }
12844
+ }
12845
+ // format the chat to string
12846
+ std::vector<const llama_chat_message *> chat_vec;
12847
+ chat_vec.resize(n_msg);
12848
+ for (size_t i = 0; i < n_msg; i++) {
12849
+ chat_vec[i] = &chat[i];
12850
+ }
12851
+ std::string formatted_chat;
12852
+ int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
12853
+ if (res < 0) {
12854
+ return res;
12855
+ }
12856
+ strncpy(buf, formatted_chat.c_str(), length);
12857
+ return res;
12858
+ }
12859
+
11812
12860
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
11813
12861
  struct llama_timings result = {
11814
12862
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
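Note: llama_chat_apply_template is a new public entry point that formats a message list with the heuristics implemented above, falling back to the template stored in the GGUF metadata (or to chatml) when tmpl is nullptr. A minimal usage sketch, assuming the llama_chat_message struct carries role/content string pointers as used in the implementation; the call returns the required length, so the buffer is retried when it was too small:

    #include "llama.h"
    #include <string>
    #include <vector>

    // Sketch: format a short conversation with the model's own chat template.
    static std::string format_chat(const llama_model * model, bool add_assistant) {
        const llama_chat_message msgs[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        const size_t n_msg = sizeof(msgs)/sizeof(msgs[0]);

        std::vector<char> buf(1024);
        int32_t n = llama_chat_apply_template(model, nullptr, msgs, n_msg, add_assistant, buf.data(), (int32_t) buf.size());
        if (n < 0) {
            return ""; // template not recognized by the heuristics
        }
        if ((size_t) n > buf.size()) {
            buf.resize(n);
            n = llama_chat_apply_template(model, nullptr, msgs, n_msg, add_assistant, buf.data(), (int32_t) buf.size());
        }
        return std::string(buf.data(), n);
    }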
@@ -11867,6 +12915,7 @@ const char * llama_print_system_info(void) {
11867
12915
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
11868
12916
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
11869
12917
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
12918
+ s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
11870
12919
 
11871
12920
  return s.c_str();
11872
12921
  }