llama_cpp 0.12.6 → 0.13.0

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -68,10 +68,12 @@
68
68
  #include <cstdio>
69
69
  #include <cstring>
70
70
  #include <ctime>
71
+ #include <cwctype>
71
72
  #include <forward_list>
72
73
  #include <fstream>
73
74
  #include <functional>
74
75
  #include <initializer_list>
76
+ #include <locale>
75
77
  #include <map>
76
78
  #include <memory>
77
79
  #include <mutex>
@@ -197,6 +199,7 @@ enum llm_arch {
197
199
  LLM_ARCH_PERSIMMON,
198
200
  LLM_ARCH_REFACT,
199
201
  LLM_ARCH_BERT,
202
+ LLM_ARCH_NOMIC_BERT,
200
203
  LLM_ARCH_BLOOM,
201
204
  LLM_ARCH_STABLELM,
202
205
  LLM_ARCH_QWEN,
@@ -207,31 +210,34 @@ enum llm_arch {
207
210
  LLM_ARCH_ORION,
208
211
  LLM_ARCH_INTERNLM2,
209
212
  LLM_ARCH_MINICPM,
213
+ LLM_ARCH_GEMMA,
210
214
  LLM_ARCH_UNKNOWN,
211
215
  };
212
216
 
213
217
  static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
214
- { LLM_ARCH_LLAMA, "llama" },
215
- { LLM_ARCH_FALCON, "falcon" },
216
- { LLM_ARCH_GPT2, "gpt2" },
217
- { LLM_ARCH_GPTJ, "gptj" },
218
- { LLM_ARCH_GPTNEOX, "gptneox" },
219
- { LLM_ARCH_MPT, "mpt" },
220
- { LLM_ARCH_BAICHUAN, "baichuan" },
221
- { LLM_ARCH_STARCODER, "starcoder" },
222
- { LLM_ARCH_PERSIMMON, "persimmon" },
223
- { LLM_ARCH_REFACT, "refact" },
224
- { LLM_ARCH_BERT, "bert" },
225
- { LLM_ARCH_BLOOM, "bloom" },
226
- { LLM_ARCH_STABLELM, "stablelm" },
227
- { LLM_ARCH_QWEN, "qwen" },
228
- { LLM_ARCH_QWEN2, "qwen2" },
229
- { LLM_ARCH_PHI2, "phi2" },
230
- { LLM_ARCH_PLAMO, "plamo" },
231
- { LLM_ARCH_CODESHELL, "codeshell" },
232
- { LLM_ARCH_ORION, "orion" },
233
- { LLM_ARCH_INTERNLM2, "internlm2" },
234
- { LLM_ARCH_MINICPM, "minicpm" },
218
+ { LLM_ARCH_LLAMA, "llama" },
219
+ { LLM_ARCH_FALCON, "falcon" },
220
+ { LLM_ARCH_GPT2, "gpt2" },
221
+ { LLM_ARCH_GPTJ, "gptj" },
222
+ { LLM_ARCH_GPTNEOX, "gptneox" },
223
+ { LLM_ARCH_MPT, "mpt" },
224
+ { LLM_ARCH_BAICHUAN, "baichuan" },
225
+ { LLM_ARCH_STARCODER, "starcoder" },
226
+ { LLM_ARCH_PERSIMMON, "persimmon" },
227
+ { LLM_ARCH_REFACT, "refact" },
228
+ { LLM_ARCH_BERT, "bert" },
229
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
230
+ { LLM_ARCH_BLOOM, "bloom" },
231
+ { LLM_ARCH_STABLELM, "stablelm" },
232
+ { LLM_ARCH_QWEN, "qwen" },
233
+ { LLM_ARCH_QWEN2, "qwen2" },
234
+ { LLM_ARCH_PHI2, "phi2" },
235
+ { LLM_ARCH_PLAMO, "plamo" },
236
+ { LLM_ARCH_CODESHELL, "codeshell" },
237
+ { LLM_ARCH_ORION, "orion" },
238
+ { LLM_ARCH_INTERNLM2, "internlm2" },
239
+ { LLM_ARCH_MINICPM, "minicpm" },
240
+ { LLM_ARCH_GEMMA, "gemma" },
235
241
  };
236
242
 
237
243
  enum llm_kv {
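The two architectures added above, nomic-bert and gemma, follow the usual registration pattern in this file: an enum value, a display name, a per-tensor name map (further down), an hparams case, and a tensor-loading case. A minimal, self-contained sketch of that pattern with simplified stand-in types (not the actual llama.cpp definitions):

```cpp
#include <cstdio>
#include <map>
#include <string>

enum llm_arch_sketch { ARCH_BERT, ARCH_NOMIC_BERT, ARCH_GEMMA, ARCH_UNKNOWN };

int main() {
    const std::map<llm_arch_sketch, const char *> arch_names = {
        { ARCH_BERT,       "bert"       },
        { ARCH_NOMIC_BERT, "nomic-bert" },
        { ARCH_GEMMA,      "gemma"      },
    };
    // per-tensor name templates, formatted with the layer index by the loader
    const std::map<std::string, std::string> gemma_tensors = {
        { "attn_norm", "blk.%d.attn_norm" },
        { "ffn_gate",  "blk.%d.ffn_gate"  },
        { "ffn_up",    "blk.%d.ffn_up"    },
        { "ffn_down",  "blk.%d.ffn_down"  },
    };

    char name[64];
    std::snprintf(name, sizeof(name), gemma_tensors.at("ffn_gate").c_str(), 3);
    // prints: gemma layer 3 gate tensor: blk.3.ffn_gate
    std::printf("%s layer 3 gate tensor: %s\n", arch_names.at(ARCH_GEMMA), name);
    return 0;
}
```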
@@ -254,7 +260,7 @@ enum llm_kv {
254
260
  LLM_KV_TENSOR_DATA_LAYOUT,
255
261
  LLM_KV_EXPERT_COUNT,
256
262
  LLM_KV_EXPERT_USED_COUNT,
257
- LLM_KV_POOLING_LAYER,
263
+ LLM_KV_POOLING_TYPE,
258
264
 
259
265
  LLM_KV_ATTENTION_HEAD_COUNT,
260
266
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -312,7 +318,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
312
318
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
313
319
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
314
320
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
315
- { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
321
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
316
322
 
317
323
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
318
324
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
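The boolean `%s.pooling_layer` GGUF key is replaced here by the integer-valued `%s.pooling_type` key, so an embedding model can declare which pooling it wants rather than just whether it pools. A hedged sketch of reading such a key; the numeric values below are assumptions for the example, not the authoritative llama.h constants:

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>

enum pooling_type_sketch : int32_t { POOLING_NONE = 0, POOLING_MEAN = 1, POOLING_CLS = 2 };

static pooling_type_sketch pooling_from_u32(uint32_t v) {
    switch (v) {
        case 0: return POOLING_NONE;
        case 1: return POOLING_MEAN;   // average all token embeddings of a sequence
        case 2: return POOLING_CLS;    // take the embedding of the first (CLS) token
        default: throw std::runtime_error("unknown pooling_type value " + std::to_string(v));
    }
}

int main() {
    std::printf("%d\n", pooling_from_u32(1));  // 1 (mean pooling)
    return 0;
}
```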
@@ -375,6 +381,7 @@ enum llm_tensor {
375
381
  LLM_TENSOR_ATTN_OUT,
376
382
  LLM_TENSOR_ATTN_NORM,
377
383
  LLM_TENSOR_ATTN_NORM_2,
384
+ LLM_TENSOR_ATTN_OUT_NORM,
378
385
  LLM_TENSOR_ATTN_ROT_EMBD,
379
386
  LLM_TENSOR_FFN_GATE_INP,
380
387
  LLM_TENSOR_FFN_NORM,
@@ -387,6 +394,7 @@ enum llm_tensor {
387
394
  LLM_TENSOR_FFN_UP_EXP,
388
395
  LLM_TENSOR_ATTN_Q_NORM,
389
396
  LLM_TENSOR_ATTN_K_NORM,
397
+ LLM_TENSOR_LAYER_OUT_NORM,
390
398
  };
391
399
 
392
400
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -503,7 +511,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
503
511
  {
504
512
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
505
513
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
506
- { LLM_TENSOR_OUTPUT, "output" },
507
514
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
508
515
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
509
516
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -552,12 +559,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
552
559
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
553
560
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
554
561
  { LLM_TENSOR_POS_EMBD, "position_embd" },
555
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
562
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
556
563
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
557
564
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
558
565
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
559
566
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
560
- { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
567
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
568
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
569
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
570
+ },
571
+ },
572
+ {
573
+ LLM_ARCH_NOMIC_BERT,
574
+ {
575
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
576
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
577
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
578
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
579
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
580
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
581
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
582
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
561
583
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
562
584
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
563
585
  },
@@ -741,6 +763,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
741
763
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
742
764
  },
743
765
  },
766
+ {
767
+ LLM_ARCH_GEMMA,
768
+ {
769
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
770
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
771
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
772
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
773
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
774
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
775
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
776
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
777
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
778
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
779
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
780
+ },
781
+ },
744
782
  {
745
783
  LLM_ARCH_UNKNOWN,
746
784
  {
@@ -814,9 +852,9 @@ struct LLM_TN {
814
852
  //
815
853
 
816
854
  static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
817
- { LLAMA_ROPE_SCALING_NONE, "none" },
818
- { LLAMA_ROPE_SCALING_LINEAR, "linear" },
819
- { LLAMA_ROPE_SCALING_YARN, "yarn" },
855
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
856
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
857
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
820
858
  };
821
859
 
822
860
  static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -826,7 +864,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
826
864
  }
827
865
  }
828
866
 
829
- return LLAMA_ROPE_SCALING_UNSPECIFIED;
867
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
830
868
  }
831
869
 
832
870
  static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
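The public rope-scaling enumerators gain a `_TYPE_` infix (e.g. `LLAMA_ROPE_SCALING_TYPE_LINEAR`), so downstream code that used the old names needs updating. A simplified stand-in for the string-to-enum parsing done by `llama_rope_scaling_type_from_string`; the enum here is illustrative only:

```cpp
#include <cstdio>
#include <map>
#include <string>

enum rope_scaling_sketch { SCALING_UNSPECIFIED = -1, SCALING_NONE = 0, SCALING_LINEAR = 1, SCALING_YARN = 2 };

static int rope_scaling_from_string(const std::string & name) {
    static const std::map<std::string, int> names = {
        { "none", SCALING_NONE }, { "linear", SCALING_LINEAR }, { "yarn", SCALING_YARN },
    };
    const auto it = names.find(name);
    return it == names.end() ? SCALING_UNSPECIFIED : it->second;
}

int main() {
    std::printf("%d %d\n", rope_scaling_from_string("yarn"), rope_scaling_from_string("bogus"));  // 2 -1
    return 0;
}
```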
@@ -1015,7 +1053,7 @@ struct llama_mmap {
1015
1053
  int fd = fileno(file->fp);
1016
1054
  int flags = MAP_SHARED;
1017
1055
  // prefetch/readahead impairs performance on NUMA systems
1018
- if (numa) { prefetch = 0; }
1056
+ if (numa) { prefetch = 0; }
1019
1057
  #ifdef __linux__
1020
1058
  // advise the kernel to read the file sequentially (increases readahead)
1021
1059
  if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -1485,6 +1523,7 @@ enum e_model {
1485
1523
  MODEL_22M,
1486
1524
  MODEL_33M,
1487
1525
  MODEL_109M,
1526
+ MODEL_137M,
1488
1527
  MODEL_335M,
1489
1528
  MODEL_0_5B,
1490
1529
  MODEL_1B,
@@ -1513,8 +1552,9 @@ static const size_t MiB = 1024*kiB;
1513
1552
  static const size_t GiB = 1024*MiB;
1514
1553
 
1515
1554
  struct llama_hparams {
1516
- bool vocab_only;
1517
- bool rope_finetuned;
1555
+ bool vocab_only;
1556
+ bool rope_finetuned;
1557
+
1518
1558
  uint32_t n_vocab;
1519
1559
  uint32_t n_ctx_train; // context size the model was trained on
1520
1560
  uint32_t n_embd;
@@ -1537,12 +1577,14 @@ struct llama_hparams {
1537
1577
  uint32_t n_yarn_orig_ctx;
1538
1578
  int32_t rope_scaling_type_train;
1539
1579
 
1540
- float f_clamp_kqv;
1541
- float f_max_alibi_bias;
1580
+ float f_clamp_kqv = 0.0f;
1581
+ float f_max_alibi_bias = 0.0f;
1542
1582
 
1543
1583
  bool causal_attn = true;
1544
- bool pooling_layer = false;
1584
+ bool need_kq_pos = false;
1545
1585
 
1586
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1587
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1546
1588
 
1547
1589
  bool operator!=(const llama_hparams & other) const {
1548
1590
  if (this->vocab_only != other.vocab_only) return true;
@@ -1601,8 +1643,8 @@ struct llama_cparams {
1601
1643
  float yarn_attn_factor;
1602
1644
  float yarn_beta_fast;
1603
1645
  float yarn_beta_slow;
1646
+ float defrag_thold;
1604
1647
 
1605
- bool mul_mat_q;
1606
1648
  bool offload_kqv;
1607
1649
  bool do_pooling;
1608
1650
 
@@ -1620,6 +1662,8 @@ struct llama_layer {
1620
1662
  struct ggml_tensor * attn_q_norm_b;
1621
1663
  struct ggml_tensor * attn_k_norm;
1622
1664
  struct ggml_tensor * attn_k_norm_b;
1665
+ struct ggml_tensor * attn_out_norm;
1666
+ struct ggml_tensor * attn_out_norm_b;
1623
1667
 
1624
1668
  // attention
1625
1669
  struct ggml_tensor * wq;
@@ -1638,6 +1682,8 @@ struct llama_layer {
1638
1682
  // normalization
1639
1683
  struct ggml_tensor * ffn_norm;
1640
1684
  struct ggml_tensor * ffn_norm_b;
1685
+ struct ggml_tensor * layer_out_norm;
1686
+ struct ggml_tensor * layer_out_norm_b;
1641
1687
 
1642
1688
  // ff
1643
1689
  struct ggml_tensor * ffn_gate; // w1
@@ -1665,11 +1711,20 @@ struct llama_kv_cell {
1665
1711
  bool has_seq_id(const llama_seq_id & id) const {
1666
1712
  return seq_id.find(id) != seq_id.end();
1667
1713
  }
1714
+
1715
+ bool is_empty() const {
1716
+ return seq_id.empty();
1717
+ }
1718
+
1719
+ bool is_same_seq(const llama_kv_cell & other) const {
1720
+ return seq_id == other.seq_id;
1721
+ }
1668
1722
  };
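`llama_kv_cell` gains `is_empty()` and `is_same_seq()`: both are questions about the set of sequence ids referencing a cell, and the second one matters for the new defragmentation pass, which should only treat cells as interchangeable when they belong to exactly the same sequences. A reduced sketch with plain integer ids (simplified types, not the llama.cpp struct):

```cpp
#include <cassert>
#include <cstdint>
#include <set>

struct kv_cell_sketch {
    int32_t pos   = -1;
    int32_t delta = 0;
    std::set<int32_t> seq_id;   // sequences that reference this cell

    bool has_seq_id(int32_t id) const { return seq_id.count(id) > 0; }
    bool is_empty() const { return seq_id.empty(); }                     // no sequence uses the cell
    bool is_same_seq(const kv_cell_sketch & o) const { return seq_id == o.seq_id; }
};

int main() {
    kv_cell_sketch a, b;
    a.seq_id = {0, 1};
    b.seq_id = {0, 1};
    assert(!a.is_empty() && a.is_same_seq(b));   // candidates for being moved together
    b.seq_id = {1};
    assert(!a.is_same_seq(b));
    return 0;
}
```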
1669
1723
 
1670
1724
  // ring-buffer of cached KV data
1671
1725
  struct llama_kv_cache {
1672
1726
  bool has_shift = false;
1727
+ bool do_defrag = false;
1673
1728
 
1674
1729
  // Note: The value of head isn't only used to optimize searching
1675
1730
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1681,6 +1736,9 @@ struct llama_kv_cache {
1681
1736
  // computed before each graph build
1682
1737
  uint32_t n = 0;
1683
1738
 
1739
+ ggml_type type_k = GGML_TYPE_F16;
1740
+ ggml_type type_v = GGML_TYPE_F16;
1741
+
1684
1742
  std::vector<llama_kv_cell> cells;
1685
1743
 
1686
1744
  std::vector<struct ggml_tensor *> k_l; // per layer
@@ -1899,8 +1957,10 @@ struct llama_context {
1899
1957
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1900
1958
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1901
1959
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1960
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
1902
1961
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1903
- struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
1962
+ struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
1963
+ struct ggml_tensor * inp_cls; // I32 [n_batch]
1904
1964
 
1905
1965
  #ifdef GGML_USE_MPI
1906
1966
  ggml_mpi_context * ctx_mpi = NULL;
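In this hunk the single `inp_sum` context tensor is split into `inp_mean` (an F32 [n_batch, n_batch] averaging matrix) and `inp_cls` (I32 indices of each sequence's CLS token). Mean pooling then becomes a matrix product of that matrix with the token embeddings. An illustrative, loop-based version of that product (plain C++, no ggml):

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 4, n_embd = 2;
    // two sequences in one batch: tokens {0,1,2} belong to seq 0, token {3} to seq 1
    std::vector<std::vector<float>> embd = { {1.f, 2.f}, {3.f, 4.f}, {5.f, 6.f}, {7.f, 8.f} };
    // M[s][t] = 1/len(s) when token t belongs to sequence s, else 0
    std::vector<std::vector<float>> M = {
        { 1.f/3, 1.f/3, 1.f/3, 0.f },   // sequence 0 averages its three tokens
        { 0.f,   0.f,   0.f,   1.f },   // sequence 1 has a single token
    };
    for (size_t s = 0; s < M.size(); ++s) {
        float out[n_embd] = {0};
        for (int t = 0; t < n_tokens; ++t)
            for (int e = 0; e < n_embd; ++e)
                out[e] += M[s][t] * embd[t][e];
        std::printf("seq %zu pooled embedding: %.3f %.3f\n", s, out[0], out[1]);
    }
    return 0;   // seq 0: 3.000 4.000, seq 1: 7.000 8.000
}
```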
@@ -1914,8 +1974,8 @@ struct llama_context {
1914
1974
  static bool llama_kv_cache_init(
1915
1975
  struct llama_kv_cache & cache,
1916
1976
  const llama_model & model,
1917
- ggml_type ktype,
1918
- ggml_type vtype,
1977
+ ggml_type type_k,
1978
+ ggml_type type_v,
1919
1979
  uint32_t n_ctx,
1920
1980
  bool offload) {
1921
1981
  const struct llama_hparams & hparams = model.hparams;
@@ -1930,6 +1990,9 @@ static bool llama_kv_cache_init(
1930
1990
  cache.size = n_ctx;
1931
1991
  cache.used = 0;
1932
1992
 
1993
+ cache.type_k = type_k;
1994
+ cache.type_v = type_v;
1995
+
1933
1996
  cache.cells.clear();
1934
1997
  cache.cells.resize(n_ctx);
1935
1998
 
@@ -1970,8 +2033,8 @@ static bool llama_kv_cache_init(
1970
2033
 
1971
2034
  for (int i = 0; i < (int) n_layer; i++) {
1972
2035
  struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
1973
- ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
1974
- ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
2036
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
2037
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
1975
2038
  ggml_format_name(k, "cache_k_l%d", i);
1976
2039
  ggml_format_name(v, "cache_v_l%d", i);
1977
2040
  cache.k_l.push_back(k);
@@ -2055,7 +2118,7 @@ static bool llama_kv_cache_find_slot(
2055
2118
  // find how many cells are currently in use
2056
2119
  static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2057
2120
  for (uint32_t i = cache.size - 1; i > 0; --i) {
2058
- if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
2121
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
2059
2122
  return i + 1;
2060
2123
  }
2061
2124
  }
@@ -2091,7 +2154,7 @@ static void llama_kv_cache_seq_rm(
2091
2154
  } else {
2092
2155
  continue;
2093
2156
  }
2094
- if (cache.cells[i].seq_id.empty()) {
2157
+ if (cache.cells[i].is_empty()) {
2095
2158
  // keep count of the number of used cells
2096
2159
  if (cache.cells[i].pos >= 0) cache.used--;
2097
2160
 
@@ -2142,7 +2205,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
2142
2205
  if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
2143
2206
  }
2144
2207
 
2145
- static void llama_kv_cache_seq_shift(
2208
+ static void llama_kv_cache_seq_add(
2146
2209
  struct llama_kv_cache & cache,
2147
2210
  llama_seq_id seq_id,
2148
2211
  llama_pos p0,
@@ -2160,10 +2223,14 @@ static void llama_kv_cache_seq_shift(
2160
2223
  cache.cells[i].delta += delta;
2161
2224
 
2162
2225
  if (cache.cells[i].pos < 0) {
2163
- if (!cache.cells[i].seq_id.empty()) cache.used--;
2226
+ if (!cache.cells[i].is_empty()) {
2227
+ cache.used--;
2228
+ }
2164
2229
  cache.cells[i].pos = -1;
2165
2230
  cache.cells[i].seq_id.clear();
2166
- if (new_head == cache.size) new_head = i;
2231
+ if (new_head == cache.size) {
2232
+ new_head = i;
2233
+ }
2167
2234
  }
2168
2235
  }
2169
2236
  }
@@ -2195,6 +2262,22 @@ static void llama_kv_cache_seq_div(
2195
2262
  }
2196
2263
  }
2197
2264
 
2265
+ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
2266
+ llama_pos result = 0;
2267
+
2268
+ for (uint32_t i = 0; i < cache.size; ++i) {
2269
+ if (cache.cells[i].has_seq_id(seq_id)) {
2270
+ result = std::max(result, cache.cells[i].pos);
2271
+ }
2272
+ }
2273
+
2274
+ return result;
2275
+ }
2276
+
2277
+ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
2278
+ cache.do_defrag = true;
2279
+ }
2280
+
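Two small KV-cache helpers appear here: `llama_kv_cache_seq_pos_max` scans the cells for a sequence's largest position, and `llama_kv_cache_defrag` only raises the `do_defrag` flag so the actual work can happen lazily. A hedged sketch of how a fragmentation threshold (the new `defrag_thold` cparam added earlier in this diff) might trigger that flag; the exact heuristic in llama.cpp may differ, this only illustrates the idea:

```cpp
#include <cstdio>

struct kv_cache_sketch {
    int  size      = 0;      // total cells
    int  used      = 0;      // cells referenced by at least one sequence
    bool do_defrag = false;
};

static void maybe_request_defrag(kv_cache_sketch & cache, int n_active, float defrag_thold) {
    if (defrag_thold <= 0.0f || n_active == 0) {
        return;                                    // a non-positive threshold disables defrag
    }
    const float fragmentation = 1.0f - (float) cache.used / (float) n_active;
    if (fragmentation > defrag_thold) {
        cache.do_defrag = true;                    // picked up lazily by the decode loop
    }
}

int main() {
    kv_cache_sketch cache;
    cache.size = 1024;
    cache.used = 300;
    maybe_request_defrag(cache, /*n_active=*/900, /*defrag_thold=*/0.5f);
    std::printf("do_defrag = %d\n", cache.do_defrag);   // 1: about two thirds of the window is holes
    return 0;
}
```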
2198
2281
  //
2199
2282
  // model loading and saving
2200
2283
  //
@@ -2266,7 +2349,7 @@ namespace GGUFMeta {
2266
2349
  }
2267
2350
  };
2268
2351
 
2269
- struct ArrayInfo{
2352
+ struct ArrayInfo {
2270
2353
  const gguf_type gt;
2271
2354
  const size_t length;
2272
2355
  const void * data;
@@ -2285,7 +2368,7 @@ namespace GGUFMeta {
2285
2368
  };
2286
2369
 
2287
2370
  template<typename T>
2288
- class GKV: public GKV_Base<T> {
2371
+ class GKV : public GKV_Base<T> {
2289
2372
  GKV() = delete;
2290
2373
 
2291
2374
  public:
@@ -2301,46 +2384,46 @@ namespace GGUFMeta {
2301
2384
 
2302
2385
  static const char * override_type_to_str(const llama_model_kv_override_type ty) {
2303
2386
  switch (ty) {
2304
- case LLAMA_KV_OVERRIDE_BOOL: return "bool";
2305
- case LLAMA_KV_OVERRIDE_INT: return "int";
2306
- case LLAMA_KV_OVERRIDE_FLOAT: return "float";
2387
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2388
+ case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2389
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2307
2390
  }
2308
2391
  return "unknown";
2309
2392
  }
2310
2393
 
2311
- static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
2312
- if (!override) { return false; }
2313
- if (override->tag == expected_type) {
2394
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
2395
+ if (!ovrd) { return false; }
2396
+ if (ovrd->tag == expected_type) {
2314
2397
  LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
2315
- __func__, override_type_to_str(override->tag), override->key);
2316
- switch (override->tag) {
2317
- case LLAMA_KV_OVERRIDE_BOOL: {
2318
- LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
2398
+ __func__, override_type_to_str(ovrd->tag), ovrd->key);
2399
+ switch (ovrd->tag) {
2400
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2401
+ LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2319
2402
  } break;
2320
- case LLAMA_KV_OVERRIDE_INT: {
2321
- LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
2403
+ case LLAMA_KV_OVERRIDE_TYPE_INT: {
2404
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2322
2405
  } break;
2323
- case LLAMA_KV_OVERRIDE_FLOAT: {
2324
- LLAMA_LOG_INFO("%.6f\n", override->float_value);
2406
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2407
+ LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2325
2408
  } break;
2326
2409
  default:
2327
2410
  // Shouldn't be possible to end up here, but just in case...
2328
2411
  throw std::runtime_error(
2329
2412
  format("Unsupported attempt to override %s type for metadata key %s\n",
2330
- override_type_to_str(override->tag), override->key));
2413
+ override_type_to_str(ovrd->tag), ovrd->key));
2331
2414
  }
2332
2415
  return true;
2333
2416
  }
2334
2417
  LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
2335
- __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
2418
+ __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
2336
2419
  return false;
2337
2420
  }
2338
2421
 
2339
2422
  template<typename OT>
2340
2423
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2341
- try_override(OT & target, const struct llama_model_kv_override *override) {
2342
- if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
2343
- target = override->bool_value;
2424
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2425
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2426
+ target = ovrd->bool_value;
2344
2427
  return true;
2345
2428
  }
2346
2429
  return false;
@@ -2348,9 +2431,9 @@ namespace GGUFMeta {
2348
2431
 
2349
2432
  template<typename OT>
2350
2433
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2351
- try_override(OT & target, const struct llama_model_kv_override *override) {
2352
- if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
2353
- target = override->int_value;
2434
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2435
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2436
+ target = ovrd->int_value;
2354
2437
  return true;
2355
2438
  }
2356
2439
  return false;
@@ -2358,9 +2441,9 @@ namespace GGUFMeta {
2358
2441
 
2359
2442
  template<typename OT>
2360
2443
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2361
- try_override(T & target, const struct llama_model_kv_override *override) {
2362
- if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
2363
- target = override->float_value;
2444
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2445
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2446
+ target = ovrd->float_value;
2364
2447
  return true;
2365
2448
  }
2366
2449
  return false;
@@ -2368,17 +2451,17 @@ namespace GGUFMeta {
2368
2451
 
2369
2452
  template<typename OT>
2370
2453
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2371
- try_override(T & target, const struct llama_model_kv_override *override) {
2454
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2372
2455
  (void)target;
2373
- (void)override;
2374
- if (!override) { return false; }
2456
+ (void)ovrd;
2457
+ if (!ovrd) { return false; }
2375
2458
  // Currently, we should never end up here so it would be a bug if we do.
2376
2459
  throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2377
- override ? override->key : "NULL"));
2460
+ ovrd ? ovrd->key : "NULL"));
2378
2461
  }
2379
2462
 
2380
- static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
2381
- if (try_override<T>(target, override)) {
2463
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2464
+ if (try_override<T>(target, ovrd)) {
2382
2465
  return true;
2383
2466
  }
2384
2467
  if (k < 0) { return false; }
@@ -2386,12 +2469,12 @@ namespace GGUFMeta {
2386
2469
  return true;
2387
2470
  }
2388
2471
 
2389
- static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
2390
- return set(ctx, gguf_find_key(ctx, key), target, override);
2472
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2473
+ return set(ctx, gguf_find_key(ctx, key), target, ovrd);
2391
2474
  }
2392
2475
 
2393
- static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
2394
- return set(ctx, key.c_str(), target, override);
2476
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2477
+ return set(ctx, key.c_str(), target, ovrd);
2395
2478
  }
2396
2479
  };
2397
2480
  }
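Inside `GGUFMeta`, the parameter previously named `override` is renamed to `ovrd` (`override` is only a contextual keyword, so the old code was legal but easy to misread), and the override tags gain a `_TYPE_` infix. The dispatch itself is unchanged: one `std::enable_if` overload per target type. A minimal stand-in that shows the pattern (not the real `llama_model_kv_override` layout):

```cpp
#include <cstdint>
#include <cstdio>
#include <type_traits>

struct kv_override_sketch {
    enum { TAG_BOOL, TAG_INT, TAG_FLOAT } tag;
    union { bool b; int64_t i; double f; };
};

// selected only when the target is bool
template <typename T>
static typename std::enable_if<std::is_same<T, bool>::value, bool>::type
try_override(T & target, const kv_override_sketch * ovrd) {
    if (ovrd && ovrd->tag == kv_override_sketch::TAG_BOOL) { target = ovrd->b; return true; }
    return false;
}

// selected for non-bool integral targets
template <typename T>
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, bool>::type
try_override(T & target, const kv_override_sketch * ovrd) {
    if (ovrd && ovrd->tag == kv_override_sketch::TAG_INT) { target = (T) ovrd->i; return true; }
    return false;
}

int main() {
    kv_override_sketch o;
    o.tag = kv_override_sketch::TAG_INT;
    o.i   = 8192;
    uint32_t n_ctx = 0;
    std::printf("applied=%d n_ctx=%u\n", try_override(n_ctx, &o), n_ctx);   // applied=1 n_ctx=8192
    return 0;
}
```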
@@ -2498,7 +2581,12 @@ struct llama_model_loader {
2498
2581
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2499
2582
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2500
2583
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2584
+ case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
2501
2585
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2586
+ case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
2587
+ case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2588
+ case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
2589
+ case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
2502
2590
  default:
2503
2591
  {
2504
2592
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2744,13 +2832,7 @@ struct llama_model_loader {
2744
2832
 
2745
2833
  std::vector<no_init<uint8_t>> read_buf;
2746
2834
 
2747
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2748
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2749
- if (!cur) {
2750
- // some tensors may be allocated in a different context
2751
- continue;
2752
- }
2753
-
2835
+ for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
2754
2836
  if (progress_callback) {
2755
2837
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
2756
2838
  return false;
@@ -2805,6 +2887,15 @@ struct llama_model_loader {
2805
2887
  }
2806
2888
  };
2807
2889
 
2890
+ template<>
2891
+ bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
2892
+ uint32_t tmp;
2893
+ const bool found = get_key(kid, tmp, required);
2894
+ result = (enum llama_pooling_type) tmp;
2895
+ return found;
2896
+ }
2897
+
2898
+
2808
2899
  //
2809
2900
  // load LLaMA models
2810
2901
  //
@@ -2846,8 +2937,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2846
2937
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2847
2938
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
2848
2939
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2849
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2940
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
2941
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
2942
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
2850
2943
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2944
+ case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
2945
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
2946
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
2947
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
2948
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
2851
2949
 
2852
2950
  default: return "unknown, may not work";
2853
2951
  }
@@ -2855,6 +2953,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2855
2953
 
2856
2954
  static const char * llama_model_type_name(e_model type) {
2857
2955
  switch (type) {
2956
+ case MODEL_22M: return "22M";
2957
+ case MODEL_33M: return "33M";
2958
+ case MODEL_109M: return "109M";
2959
+ case MODEL_137M: return "137M";
2960
+ case MODEL_0_5B: return "0.5B";
2858
2961
  case MODEL_1B: return "1B";
2859
2962
  case MODEL_2B: return "2B";
2860
2963
  case MODEL_3B: return "3B";
@@ -2876,16 +2979,16 @@ static const char * llama_model_type_name(e_model type) {
2876
2979
  default: return "?B";
2877
2980
  }
2878
2981
  }
2982
+
2879
2983
  static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2880
2984
  switch (type) {
2881
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2882
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2883
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2884
- default: return "unknown";
2985
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2986
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2987
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2988
+ default: return "unknown";
2885
2989
  }
2886
2990
  }
2887
2991
 
2888
-
2889
2992
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2890
2993
  model.arch = ml.get_arch();
2891
2994
  if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2949,7 +3052,7 @@ static void llm_load_hparams(
2949
3052
  std::string rope_scaling("linear");
2950
3053
  ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2951
3054
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2952
- GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
3055
+ GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
2953
3056
 
2954
3057
  // rope_freq_scale (inverse of the kv) is optional
2955
3058
  float ropescale = 0.0f;
@@ -3024,6 +3127,11 @@ static void llm_load_hparams(
3024
3127
  case 40: model.type = e_model::MODEL_13B; break;
3025
3128
  default: model.type = e_model::MODEL_UNKNOWN;
3026
3129
  }
3130
+
3131
+ if (model.type == e_model::MODEL_13B) {
3132
+ // TODO: become GGUF KV parameter
3133
+ hparams.f_max_alibi_bias = 8.0f;
3134
+ }
3027
3135
  } break;
3028
3136
  case LLM_ARCH_STARCODER:
3029
3137
  {
@@ -3051,13 +3159,16 @@ static void llm_load_hparams(
3051
3159
  case 32: model.type = e_model::MODEL_1B; break;
3052
3160
  default: model.type = e_model::MODEL_UNKNOWN;
3053
3161
  }
3162
+
3163
+ // TODO: become GGUF KV parameter
3164
+ hparams.f_max_alibi_bias = 8.0f;
3054
3165
  } break;
3055
3166
  case LLM_ARCH_BERT:
3056
3167
  {
3057
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3058
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3168
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3169
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3059
3170
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3060
- ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
3171
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3061
3172
 
3062
3173
  switch (hparams.n_layer) {
3063
3174
  case 3:
@@ -3073,6 +3184,17 @@ static void llm_load_hparams(
3073
3184
  model.type = e_model::MODEL_335M; break; // bge-large
3074
3185
  }
3075
3186
  } break;
3187
+ case LLM_ARCH_NOMIC_BERT:
3188
+ {
3189
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3190
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3191
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3192
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3193
+
3194
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
3195
+ model.type = e_model::MODEL_137M;
3196
+ }
3197
+ } break;
3076
3198
  case LLM_ARCH_BLOOM:
3077
3199
  {
3078
3200
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3085,11 +3207,12 @@ static void llm_load_hparams(
3085
3207
  case 4096: model.type = e_model::MODEL_7B; break;
3086
3208
  } break;
3087
3209
  }
3210
+
3211
+ // TODO: become GGUF KV parameter
3212
+ hparams.f_max_alibi_bias = 8.0f;
3088
3213
  } break;
3089
3214
  case LLM_ARCH_MPT:
3090
3215
  {
3091
- hparams.f_clamp_kqv = 0.0f;
3092
-
3093
3216
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3094
3217
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
3095
3218
  ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
@@ -3187,10 +3310,26 @@ static void llm_load_hparams(
3187
3310
  default: model.type = e_model::MODEL_UNKNOWN;
3188
3311
  }
3189
3312
  } break;
3313
+ case LLM_ARCH_GEMMA:
3314
+ {
3315
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3316
+
3317
+ switch (hparams.n_layer) {
3318
+ case 18: model.type = e_model::MODEL_2B; break;
3319
+ case 28: model.type = e_model::MODEL_7B; break;
3320
+ default: model.type = e_model::MODEL_UNKNOWN;
3321
+ }
3322
+ } break;
3190
3323
  default: (void)0;
3191
3324
  }
3192
3325
 
3193
3326
  model.ftype = ml.ftype;
3327
+
3328
+ if (hparams.f_max_alibi_bias > 0.0f) {
3329
+ hparams.need_kq_pos = true;
3330
+ }
3331
+
3332
+ hparams.rope_type = llama_rope_type(&model);
3194
3333
  }
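`llama_hparams` now caches `pooling_type` and `rope_type`, and `rope_type` is resolved once at load time via `llama_rope_type(&model)` instead of every graph builder hard-coding a rope mode (the old `LLM_ROPE`/`LLM_ROPE_NEOX`/`LLM_ROPE_GLM` enum is deleted further down). A hedged illustration of the per-architecture mapping; the authoritative table is `llama_rope_type()` in llama.cpp and the enum below is a stand-in:

```cpp
#include <cstdio>

enum rope_type_sketch { ROPE_NONE = -1, ROPE_NORM = 0, ROPE_NEOX = 2 };
enum arch_sketch { ARCH_LLAMA, ARCH_BERT, ARCH_FALCON, ARCH_GEMMA };

static rope_type_sketch rope_type_for(arch_sketch arch) {
    switch (arch) {
        case ARCH_BERT:   return ROPE_NONE;   // absolute position embeddings, no RoPE
        case ARCH_LLAMA:  return ROPE_NORM;   // "normal" interleaved RoPE
        case ARCH_FALCON:
        case ARCH_GEMMA:  return ROPE_NEOX;   // NeoX-style RoPE
        default:          return ROPE_NONE;
    }
}

int main() {
    std::printf("gemma rope type: %d\n", rope_type_for(ARCH_GEMMA));   // 2 (NeoX-style)
    return 0;
}
```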
3195
3334
 
3196
3335
  // TODO: This should probably be in llama.h
@@ -3493,6 +3632,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3493
3632
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3494
3633
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3495
3634
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3635
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3636
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3496
3637
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3497
3638
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3498
3639
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3559,7 +3700,7 @@ static bool llm_load_tensors(
3559
3700
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3560
3701
  }
3561
3702
 
3562
- if (split_mode == LLAMA_SPLIT_LAYER) {
3703
+ if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
3563
3704
  // calculate the split points
3564
3705
  int device_count = llama_get_device_count();
3565
3706
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3598,10 +3739,10 @@ static bool llm_load_tensors(
3598
3739
  }
3599
3740
  } else {
3600
3741
  ggml_backend_buffer_type_t split_buft;
3601
- if (split_mode == LLAMA_SPLIT_ROW) {
3742
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
3602
3743
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
3603
3744
  } else {
3604
- // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
3745
+ // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
3605
3746
  split_buft = llama_default_buffer_type_offload(main_gpu);
3606
3747
  }
3607
3748
  // assign the repeating layers
@@ -3634,7 +3775,7 @@ static bool llm_load_tensors(
3634
3775
  }
3635
3776
 
3636
3777
  // create one context per buffer type
3637
- size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
3778
+ size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
3638
3779
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
3639
3780
  for (auto & it : buft_layer_count) {
3640
3781
  struct ggml_init_params params = {
@@ -3772,6 +3913,7 @@ static bool llm_load_tensors(
3772
3913
  } else {
3773
3914
  model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
3774
3915
  ml.n_created--; // artificial tensor
3916
+ ml.size_data += ggml_nbytes(model.output);
3775
3917
  }
3776
3918
  }
3777
3919
 
@@ -3875,10 +4017,14 @@ static bool llm_load_tensors(
3875
4017
  }
3876
4018
  } break;
3877
4019
  case LLM_ARCH_BERT:
4020
+ case LLM_ARCH_NOMIC_BERT:
3878
4021
  {
3879
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3880
- model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3881
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
4022
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4023
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
4024
+ if (model.arch == LLM_ARCH_BERT) {
4025
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
4026
+ }
4027
+
3882
4028
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3883
4029
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3884
4030
 
@@ -3888,29 +4034,38 @@ static bool llm_load_tensors(
3888
4034
 
3889
4035
  auto & layer = model.layers[i];
3890
4036
 
3891
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3892
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4037
+ if (model.arch == LLM_ARCH_BERT) {
4038
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4039
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3893
4040
 
3894
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3895
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
4041
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4042
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3896
4043
 
3897
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3898
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
4044
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4045
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
4046
+ } else {
4047
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4048
+ }
3899
4049
 
3900
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3901
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
4050
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3902
4051
 
3903
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3904
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
4052
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4053
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
3905
4054
 
3906
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3907
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4055
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4056
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3908
4057
 
3909
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3910
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
4058
+ if (model.arch == LLM_ARCH_BERT) {
4059
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4060
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3911
4061
 
3912
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3913
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4062
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4063
+ } else {
4064
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4065
+ }
4066
+
4067
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4068
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
3914
4069
  }
3915
4070
  } break;
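BERT and nomic-bert now share one loading branch: BERT keeps separate Q/K/V projections with biases, while nomic-bert uses a fused QKV weight, no biases, and a gated FFN (hence the extra `ffn_gate`). A shape-only sketch of the difference; the dimensions are example values, not read from any model:

```cpp
#include <cstdio>

int main() {
    const int n_embd     = 768;   // hidden size (example)
    const int n_embd_gqa = 768;   // k/v width; equal to n_embd when there is no GQA

    // separate projections (BERT branch)
    std::printf("wq: [%d, %d]  wk: [%d, %d]  wv: [%d, %d]\n",
                n_embd, n_embd, n_embd, n_embd_gqa, n_embd, n_embd_gqa);

    // fused projection (nomic-bert branch): one matmul, sliced into Q, K, V afterwards
    std::printf("wqkv: [%d, %d]\n", n_embd, n_embd + 2*n_embd_gqa);
    return 0;
}
```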
3916
4071
  case LLM_ARCH_BLOOM:
@@ -3958,7 +4113,12 @@ static bool llm_load_tensors(
3958
4113
  // output
3959
4114
  {
3960
4115
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3961
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4116
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
4117
+
4118
+ // same as tok_embd, duplicated to allow offloading
4119
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4120
+ ml.n_created--; // artificial tensor
4121
+ ml.size_data += ggml_nbytes(model.output);
3962
4122
  }
3963
4123
 
3964
4124
  for (int i = 0; i < n_layer; ++i) {
@@ -3967,14 +4127,23 @@ static bool llm_load_tensors(
3967
4127
 
3968
4128
  auto & layer = model.layers[i];
3969
4129
 
3970
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4130
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4131
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
3971
4132
 
3972
4133
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4134
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
4135
+
3973
4136
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4137
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
3974
4138
 
3975
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3976
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3977
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4139
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4140
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
4141
+
4142
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
4143
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
4144
+
4145
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4146
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
3978
4147
 
3979
4148
  // AWQ ScaleActivation layer
3980
4149
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4287,6 +4456,40 @@ static bool llm_load_tensors(
4287
4456
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4288
4457
  }
4289
4458
  } break;
4459
+ case LLM_ARCH_GEMMA:
4460
+ {
4461
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4462
+
4463
+ // output
4464
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4465
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
4466
+ ml.n_created--; // artificial tensor
4467
+ ml.size_data += ggml_nbytes(model.output);
4468
+
4469
+ const int64_t n_ff = hparams.n_ff;
4470
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
4471
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4472
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4473
+
4474
+ for (uint32_t i = 0; i < n_layer; ++i) {
4475
+ ggml_context * ctx_layer = ctx_for_layer(i);
4476
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4477
+
4478
+ auto & layer = model.layers[i];
4479
+
4480
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4481
+
4482
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
4483
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
4484
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
4485
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
4486
+
4487
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4488
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4489
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4490
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4491
+ }
4492
+ } break;
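Gemma has no separate output matrix: the loader re-reads `token_embd` as `output` (the same tied-embedding trick as in the @@ -3958 hunk above), then decrements `n_created` and adds the tensor's bytes to `size_data` because the same weights are counted twice. Conceptually the logits are just the final hidden state multiplied by the transposed embedding matrix; a tiny loop-based illustration with made-up numbers:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_vocab = 4, n_embd = 3;
    std::vector<std::vector<float>> tok_embd = {        // one row per vocabulary entry
        {0.1f, 0.0f, 0.2f}, {0.0f, 0.3f, 0.0f}, {0.5f, 0.1f, 0.0f}, {0.0f, 0.0f, 0.4f},
    };
    std::vector<float> hidden = {1.0f, 2.0f, 3.0f};     // final hidden state of one token

    for (int v = 0; v < n_vocab; ++v) {
        float logit = 0.0f;
        for (int e = 0; e < n_embd; ++e) logit += tok_embd[v][e] * hidden[e];
        std::printf("logit[%d] = %.2f\n", v, logit);
    }
    return 0;
}
```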
4290
4493
  default:
4291
4494
  throw std::runtime_error("unknown architecture");
4292
4495
  }
@@ -4452,12 +4655,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4452
4655
 
4453
4656
  using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
4454
4657
 
4455
- enum llm_rope_type {
4456
- LLM_ROPE,
4457
- LLM_ROPE_NEOX,
4458
- LLM_ROPE_GLM,
4459
- };
4460
-
4461
4658
  enum llm_ffn_op_type {
4462
4659
  LLM_FFN_SILU,
4463
4660
  LLM_FFN_GELU,
@@ -4503,55 +4700,6 @@ static struct ggml_tensor * llm_build_inp_embd(
4503
4700
  return inpL;
4504
4701
  }
4505
4702
 
4506
- // Persimmon: n_rot = n_embd_head_k/2
4507
- // Other: n_rot = n_embd_head_k
4508
- static void llm_build_k_shift(
4509
- struct ggml_context * ctx,
4510
- const llama_hparams & hparams,
4511
- const llama_cparams & cparams,
4512
- const llama_kv_cache & kv,
4513
- struct ggml_cgraph * graph,
4514
- struct ggml_tensor * K_shift,
4515
- llm_rope_type type,
4516
- int64_t n_ctx,
4517
- float freq_base,
4518
- float freq_scale,
4519
- const llm_build_cb & cb) {
4520
- const int64_t n_layer = hparams.n_layer;
4521
- const int64_t n_head_kv = hparams.n_head_kv;
4522
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
4523
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4524
- const int32_t n_rot = hparams.n_rot;
4525
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
4526
- const float ext_factor = cparams.yarn_ext_factor;
4527
- const float attn_factor = cparams.yarn_attn_factor;
4528
- const float beta_fast = cparams.yarn_beta_fast;
4529
- const float beta_slow = cparams.yarn_beta_slow;
4530
-
4531
- int rope_type = 0;
4532
-
4533
- switch (type) {
4534
- case LLM_ROPE: rope_type = 0; break;
4535
- case LLM_ROPE_NEOX: rope_type = 2; break;
4536
- case LLM_ROPE_GLM: rope_type = 4; break;
4537
- }
4538
-
4539
- for (int il = 0; il < n_layer; ++il) {
4540
- struct ggml_tensor * tmp =
4541
- // we rotate only the first n_rot dimensions
4542
- ggml_rope_custom_inplace(ctx,
4543
- ggml_view_3d(ctx, kv.k_l[il],
4544
- n_embd_head_k, n_head_kv, n_ctx,
4545
- ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
4546
- ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
4547
- 0),
4548
- K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
4549
- ext_factor, attn_factor, beta_fast, beta_slow);
4550
- cb(tmp, "K_shifted", il);
4551
- ggml_build_forward_expand(graph, tmp);
4552
- }
4553
- }
4554
-
4555
4703
  static void llm_build_kv_store(
4556
4704
  struct ggml_context * ctx,
4557
4705
  const llama_hparams & hparams,
@@ -4720,10 +4868,10 @@ static struct ggml_tensor * llm_build_kqv(
4720
4868
  struct ggml_tensor * wo_b,
4721
4869
  struct ggml_tensor * q_cur,
4722
4870
  struct ggml_tensor * kq_mask,
4871
+ struct ggml_tensor * kq_pos,
4723
4872
  int64_t n_ctx,
4724
4873
  int32_t n_tokens,
4725
4874
  int32_t n_kv,
4726
- float max_alibi_bias,
4727
4875
  float kq_scale,
4728
4876
  const llm_build_cb & cb,
4729
4877
  int il) {
@@ -4753,26 +4901,26 @@ static struct ggml_tensor * llm_build_kqv(
4753
4901
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4754
4902
  }
4755
4903
 
4756
- if (max_alibi_bias > 0.0f) {
4757
- // temporary branch until we figure out how to handle ggml_alibi through ggml_add
4904
+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
4905
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
4906
+ #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4907
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4908
+ if (hparams.f_max_alibi_bias > 0.0f) {
4758
4909
  kq = ggml_scale(ctx, kq, kq_scale);
4759
4910
  cb(kq, "kq_scaled", il);
4760
4911
 
4761
- if (max_alibi_bias > 0.0f) {
4762
- // TODO: n_head or n_head_kv
4763
- // TODO: K-shift is likely not working
4764
- // TODO: change to ggml_add
4765
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
4766
- cb(kq, "kq_scaled_alibi", il);
4767
- }
4912
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
4913
+ cb(kq, "kq_scaled_alibi", il);
4768
4914
 
4769
4915
  kq = ggml_add(ctx, kq, kq_mask);
4770
4916
  cb(kq, "kq_masked", il);
4771
4917
 
4772
4918
  kq = ggml_soft_max(ctx, kq);
4773
4919
  cb(kq, "kq_soft_max", il);
4774
- } else {
4775
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
4920
+ } else
4921
+ #endif
4922
+ {
4923
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
4776
4924
  cb(kq, "kq_soft_max_ext", il);
4777
4925
  }
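Most backends now pass the KV positions (`kq_pos`) and `hparams.f_max_alibi_bias` directly to `ggml_soft_max_ext`, which folds the ALiBi position bias into the softmax kernel; Vulkan and Kompute temporarily keep the old `ggml_alibi` path behind the `#pragma message` warnings above. A simplified sketch of the bias being folded in (not ggml's exact slope schedule, which special-cases non-power-of-two head counts):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 8;
    const float max_bias = 8.0f;   // the value the loader now stores in hparams.f_max_alibi_bias

    for (int h = 0; h < n_head; ++h) {
        const float slope = std::pow(2.0f, -max_bias * (h + 1) / n_head);
        // bias added to the attention score of a key at position p, before the softmax
        const int p = 5;
        std::printf("head %d: slope %.5f, bias at pos %d = %.5f\n", h, slope, p, slope * p);
    }
    return 0;
}
```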
4778
4926
 
@@ -4820,11 +4968,11 @@ static struct ggml_tensor * llm_build_kv(
4820
4968
  struct ggml_tensor * v_cur,
4821
4969
  struct ggml_tensor * q_cur,
4822
4970
  struct ggml_tensor * kq_mask,
4971
+ struct ggml_tensor * kq_pos,
4823
4972
  int64_t n_ctx,
4824
4973
  int32_t n_tokens,
4825
4974
  int32_t kv_head,
4826
4975
  int32_t n_kv,
4827
- float max_alibi_bias,
4828
4976
  float kq_scale,
4829
4977
  const llm_build_cb & cb,
4830
4978
  int il) {
@@ -4838,9 +4986,8 @@ static struct ggml_tensor * llm_build_kv(
4838
4986
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4839
4987
 
4840
4988
  struct ggml_tensor * cur;
4841
- cur = llm_build_kqv(ctx, model, hparams, kv, graph,
4842
- wo, wo_b,
4843
- q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
4989
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4990
+ q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
4844
4991
  cb(cur, "kqv_out", il);
4845
4992
 
4846
4993
  return cur;
@@ -4856,6 +5003,7 @@ struct llm_build_context {
4856
5003
 
4857
5004
  const int64_t n_embd;
4858
5005
  const int64_t n_layer;
5006
+ const int64_t n_rot;
4859
5007
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
4860
5008
  const int64_t n_head;
4861
5009
  const int64_t n_head_kv;
@@ -4880,8 +5028,8 @@ struct llm_build_context {
4880
5028
  const int32_t kv_head; // index of where we store new KV data in the cache
4881
5029
  const int32_t n_orig_ctx;
4882
5030
 
4883
- const bool do_rope_shift;
4884
- const bool do_pooling;
5031
+ const enum llama_pooling_type pooling_type;
5032
+ const enum llama_rope_type rope_type;
4885
5033
 
4886
5034
  const llm_build_cb & cb;
4887
5035
 
@@ -4903,6 +5051,7 @@ struct llm_build_context {
4903
5051
  kv_self (lctx.kv_self),
4904
5052
  n_embd (hparams.n_embd),
4905
5053
  n_layer (hparams.n_layer),
5054
+ n_rot (hparams.n_rot),
4906
5055
  n_ctx (cparams.n_ctx),
4907
5056
  n_head (hparams.n_head),
4908
5057
  n_head_kv (hparams.n_head_kv),
@@ -4924,8 +5073,8 @@ struct llm_build_context {
4924
5073
  n_kv (worst_case ? n_ctx : kv_self.n),
4925
5074
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4926
5075
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4927
- do_rope_shift (worst_case || kv_self.has_shift),
4928
- do_pooling (hparams.pooling_layer && cparams.do_pooling),
5076
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
5077
+ rope_type (hparams.rope_type),
4929
5078
  cb (cb),
4930
5079
  buf_compute_meta (lctx.buf_compute_meta) {
4931
5080
  // all initializations should be done in init()
@@ -4948,6 +5097,76 @@ struct llm_build_context {
4948
5097
  }
4949
5098
  }
4950
5099
 
5100
+ struct ggml_cgraph * build_k_shift() {
5101
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5102
+
5103
+ for (int il = 0; il < n_layer; ++il) {
5104
+ struct ggml_tensor * tmp =
5105
+ // we rotate only the first n_rot dimensions
5106
+ ggml_rope_custom_inplace(ctx0,
5107
+ ggml_view_3d(ctx0, kv_self.k_l[il],
5108
+ n_embd_head_k, n_head_kv, n_ctx,
5109
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
5110
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5111
+ 0),
5112
+ lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5113
+ ext_factor, attn_factor, beta_fast, beta_slow);
5114
+ cb(tmp, "K_shifted", il);
5115
+ ggml_build_forward_expand(gf, tmp);
5116
+ }
5117
+
5118
+ return gf;
5119
+ }
5120
+
5121
+ struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
5122
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5123
+
5124
+ for (uint32_t i = 0; i < ids.size(); ++i) {
5125
+ const uint32_t id = ids[i];
5126
+
5127
+ if (i == id || id == ids.size()) {
5128
+ continue;
5129
+ }
5130
+
5131
+ uint32_t nm = 1;
5132
+
5133
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
5134
+ nm++;
5135
+ }
5136
+
5137
+ for (int il = 0; il < n_layer; ++il) {
5138
+ ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
5139
+ n_embd_k_gqa, nm,
5140
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5141
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
5142
+
5143
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
5144
+ n_embd_k_gqa, nm,
5145
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5146
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
5147
+
5148
+ ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
5149
+ nm, n_embd_v_gqa,
5150
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5151
+ ggml_row_size(kv_self.v_l[il]->type, i));
5152
+
5153
+ ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
5154
+ nm, n_embd_v_gqa,
5155
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5156
+ ggml_row_size(kv_self.v_l[il]->type, id));
5157
+
5158
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
5159
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
5160
+ }
5161
+
5162
+ i += nm - 1;
5163
+ }
5164
+
5165
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
5166
+
5167
+ return gf;
5168
+ }
5169
+
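`build_defrag` receives a move table `ids` where `ids[i]` is the destination cell of cell `i` (`i == ids[i]` or `ids.size()` means the cell stays put or is unused). The loop batches maximal runs of cells that shift together, so each run costs one K copy and one V copy per layer instead of one per cell. A standalone sketch of just the run-batching logic, with the copies replaced by a printf:

```cpp
#include <cstdio>
#include <vector>

int main() {
    //              cell:   0  1  2  3  4  5
    std::vector<unsigned> ids = { 0, 1, 6, 2, 3, 4 };   // cells 3..5 move down to 2..4; cell 2 is unused
    for (unsigned i = 0; i < ids.size(); ++i) {
        const unsigned id = ids[i];
        if (i == id || id == ids.size()) {
            continue;                                   // cell stays in place (or is unused)
        }
        unsigned nm = 1;                                // length of the contiguous run
        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
            nm++;
        }
        // in build_defrag this becomes one ggml_cpy per layer for K and one for V
        std::printf("move %u cells: [%u..%u] -> [%u..%u]\n", nm, i, i + nm - 1, id, id + nm - 1);
        i += nm - 1;
    }
    return 0;   // prints: move 3 cells: [3..5] -> [2..4]
}
```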
4951
5170
  struct ggml_cgraph * build_llama() {
4952
5171
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4953
5172
 
@@ -4969,11 +5188,6 @@ struct llm_build_context {
4969
5188
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4970
5189
  cb(KQ_mask, "KQ_mask", -1);
4971
5190
 
4972
- // shift the entire K-cache if needed
4973
- if (do_rope_shift) {
4974
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4975
- }
4976
-
4977
5191
  for (int il = 0; il < n_layer; ++il) {
4978
5192
  struct ggml_tensor * inpSA = inpL;
4979
5193
 
@@ -5008,22 +5222,22 @@ struct llm_build_context {
5008
5222
  }
5009
5223
 
5010
5224
  Qcur = ggml_rope_custom(
5011
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5012
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5225
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5226
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5013
5227
  ext_factor, attn_factor, beta_fast, beta_slow
5014
5228
  );
5015
5229
  cb(Qcur, "Qcur", il);
5016
5230
 
5017
5231
  Kcur = ggml_rope_custom(
5018
5232
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5019
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5233
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5020
5234
  ext_factor, attn_factor, beta_fast, beta_slow
5021
5235
  );
5022
5236
  cb(Kcur, "Kcur", il);
5023
5237
 
5024
5238
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5025
5239
  model.layers[il].wo, model.layers[il].bo,
5026
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5240
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5027
5241
  cb(cur, "kqv_out", il);
5028
5242
  }
5029
5243
 
@@ -5153,10 +5367,9 @@ struct llm_build_context {
5153
5367
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5154
5368
  cb(KQ_mask, "KQ_mask", -1);
5155
5369
 
5156
- // shift the entire K-cache if needed
5157
- if (do_rope_shift) {
5158
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5159
- }
5370
+ // positions of the tokens in the KV cache
5371
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5372
+ cb(KQ_pos, "KQ_pos", -1);
5160
5373
 
5161
5374
  for (int il = 0; il < n_layer; ++il) {
5162
5375
  struct ggml_tensor * inpSA = inpL;
@@ -5181,12 +5394,12 @@ struct llm_build_context {
5181
5394
  case MODEL_7B:
5182
5395
  Qcur = ggml_rope_custom(
5183
5396
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5184
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5397
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5185
5398
  ext_factor, attn_factor, beta_fast, beta_slow
5186
5399
  );
5187
5400
  Kcur = ggml_rope_custom(
5188
5401
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5189
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5402
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5190
5403
  ext_factor, attn_factor, beta_fast, beta_slow
5191
5404
  );
5192
5405
  break;
@@ -5201,12 +5414,9 @@ struct llm_build_context {
5201
5414
  cb(Kcur, "Kcur", il);
5202
5415
 
5203
5416
 
5204
- // apply ALiBi for 13B model
5205
- const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
5206
-
5207
5417
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5208
5418
  model.layers[il].wo, NULL,
5209
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5419
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5210
5420
  cb(cur, "kqv_out", il);
5211
5421
  }
5212
5422
 
@@ -5274,11 +5484,6 @@ struct llm_build_context {
5274
5484
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5275
5485
  cb(KQ_mask, "KQ_mask", -1);
5276
5486
 
5277
- // shift the entire K-cache if needed
5278
- if (do_rope_shift) {
5279
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5280
- }
5281
-
5282
5487
  for (int il = 0; il < n_layer; ++il) {
5283
5488
  struct ggml_tensor * attn_norm;
5284
5489
 
@@ -5317,20 +5522,20 @@ struct llm_build_context {
5317
5522
 
5318
5523
  // neox-style rotation (rope_type == 2)
5319
5524
  Qcur = ggml_rope_custom(
5320
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5525
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5321
5526
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5322
5527
  );
5323
5528
  cb(Qcur, "Qcur", il);
5324
5529
 
5325
5530
  Kcur = ggml_rope_custom(
5326
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5531
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5327
5532
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5328
5533
  );
5329
5534
  cb(Kcur, "Kcur", il);
5330
5535
 
5331
5536
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5332
5537
  model.layers[il].wo, NULL,
5333
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5538
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5334
5539
  cb(cur, "kqv_out", il);
5335
5540
  }
5336
5541
 
@@ -5429,7 +5634,7 @@ struct llm_build_context {
5429
5634
 
5430
5635
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5431
5636
  model.layers[il].wo, model.layers[il].bo,
5432
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5637
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5433
5638
  cb(cur, "kqv_out", il);
5434
5639
  }
5435
5640
 
@@ -5493,10 +5698,6 @@ struct llm_build_context {
5493
5698
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5494
5699
  cb(KQ_mask, "KQ_mask", -1);
5495
5700
 
5496
- if (do_rope_shift) {
5497
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5498
- }
5499
-
5500
5701
  for (int il = 0; il < n_layer; ++il) {
5501
5702
  struct ggml_tensor * residual = inpL;
5502
5703
 
@@ -5554,7 +5755,7 @@ struct llm_build_context {
5554
5755
 
5555
5756
  // RoPE the first n_rot of q/k, pass the other half, and concat.
5556
5757
  struct ggml_tensor * qrot = ggml_view_3d(
5557
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5758
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5558
5759
  ggml_element_size(tmpq) * n_embd_head,
5559
5760
  ggml_element_size(tmpq) * n_embd_head * n_head,
5560
5761
  0
@@ -5562,7 +5763,7 @@ struct llm_build_context {
5562
5763
  cb(qrot, "qrot", il);
5563
5764
 
5564
5765
  struct ggml_tensor * krot = ggml_view_3d(
5565
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5766
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5566
5767
  ggml_element_size(tmpk) * n_embd_head,
5567
5768
  ggml_element_size(tmpk) * n_embd_head * n_head,
5568
5769
  0
@@ -5571,29 +5772,29 @@ struct llm_build_context {
5571
5772
 
5572
5773
  // get the second half of tmpq, e.g. tmpq[n_rot:, :, :]
5573
5774
  struct ggml_tensor * qpass = ggml_view_3d(
5574
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5775
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5575
5776
  ggml_element_size(tmpq) * n_embd_head,
5576
5777
  ggml_element_size(tmpq) * n_embd_head * n_head,
5577
- ggml_element_size(tmpq) * hparams.n_rot
5778
+ ggml_element_size(tmpq) * n_rot
5578
5779
  );
5579
5780
  cb(qpass, "qpass", il);
5580
5781
 
5581
5782
  struct ggml_tensor * kpass = ggml_view_3d(
5582
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5783
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5583
5784
  ggml_element_size(tmpk) * n_embd_head,
5584
5785
  ggml_element_size(tmpk) * n_embd_head * n_head,
5585
- ggml_element_size(tmpk) * hparams.n_rot
5786
+ ggml_element_size(tmpk) * n_rot
5586
5787
  );
5587
5788
  cb(kpass, "kpass", il);
5588
5789
 
5589
5790
  struct ggml_tensor * qrotated = ggml_rope_custom(
5590
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5791
+ ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5591
5792
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5592
5793
  );
5593
5794
  cb(qrotated, "qrotated", il);
5594
5795
 
5595
5796
  struct ggml_tensor * krotated = ggml_rope_custom(
5596
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5797
+ ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5597
5798
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5598
5799
  );
5599
5800
  cb(krotated, "krotated", il);
@@ -5634,7 +5835,7 @@ struct llm_build_context {
5634
5835
 
5635
5836
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5636
5837
  model.layers[il].wo, model.layers[il].bo,
5637
- Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5838
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5638
5839
  cb(cur, "kqv_out", il);
5639
5840
  }
5640
5841
 
@@ -5696,6 +5897,10 @@ struct llm_build_context {
5696
5897
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5697
5898
  cb(KQ_mask, "KQ_mask", -1);
5698
5899
 
5900
+ // positions of the tokens in the KV cache
5901
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5902
+ cb(KQ_pos, "KQ_pos", -1);
5903
+
5699
5904
  for (int il = 0; il < n_layer; ++il) {
5700
5905
  struct ggml_tensor * inpSA = inpL;
5701
5906
 
@@ -5723,7 +5928,7 @@ struct llm_build_context {
5723
5928
 
5724
5929
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5725
5930
  model.layers[il].wo, NULL,
5726
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5931
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5727
5932
  cb(cur, "kqv_out", il);
5728
5933
  }
5729
5934
 
@@ -5773,6 +5978,7 @@ struct llm_build_context {
5773
5978
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5774
5979
 
5775
5980
  const int64_t n_embd_head = hparams.n_embd_head_v;
5981
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5776
5982
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5777
5983
 
5778
5984
  struct ggml_tensor * cur;
@@ -5781,7 +5987,8 @@ struct llm_build_context {
5781
5987
  // get input vectors with right size
5782
5988
  const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5783
5989
  struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5784
- struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
5990
+ struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5991
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5785
5992
 
5786
5993
  // construct input embeddings (token, type, position)
5787
5994
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5789,7 +5996,9 @@ struct llm_build_context {
5789
5996
  // token types are hardcoded to zero ("Sentence A")
5790
5997
  struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5791
5998
  inpL = ggml_add(ctx0, inpL, type_row0);
5792
- inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5999
+ if (model.arch == LLM_ARCH_BERT) {
6000
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
6001
+ }
5793
6002
  cb(inpL, "inp_embd", -1);
5794
6003
 
5795
6004
  // embed layer norm
@@ -5805,7 +6014,7 @@ struct llm_build_context {
5805
6014
  struct ggml_tensor * cur = inpL;
5806
6015
 
5807
6016
  // self-attention
5808
- {
6017
+ if (model.arch == LLM_ARCH_BERT) {
5809
6018
  struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5810
6019
  cb(Qcur, "Qcur", il);
5811
6020
 
@@ -5820,7 +6029,38 @@ struct llm_build_context {
5820
6029
 
5821
6030
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5822
6031
  model.layers[il].wo, model.layers[il].bo,
5823
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6032
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6033
+ cb(cur, "kqv_out", il);
6034
+ } else {
6035
+ // compute Q and K and RoPE them
6036
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6037
+ cb(cur, "wqkv", il);
6038
+
6039
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6040
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6041
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6042
+
6043
+ cb(Qcur, "Qcur", il);
6044
+ cb(Kcur, "Kcur", il);
6045
+ cb(Vcur, "Vcur", il);
6046
+
6047
+ Qcur = ggml_rope_custom(
6048
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6049
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6050
+ ext_factor, attn_factor, beta_fast, beta_slow
6051
+ );
6052
+ cb(Qcur, "Qcur", il);
6053
+
6054
+ Kcur = ggml_rope_custom(
6055
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6056
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6057
+ ext_factor, attn_factor, beta_fast, beta_slow
6058
+ );
6059
+ cb(Kcur, "Kcur", il);
6060
+
6061
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6062
+ model.layers[il].wo, model.layers[il].bo,
6063
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5824
6064
  cb(cur, "kqv_out", il);
5825
6065
  }
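In the nomic-bert branch the attention inputs come from one fused wqkv matmul whose per-token output packs Q, K and V back to back; the three ggml_view_2d calls slice it at float offsets 0, n_embd and n_embd + n_embd_gqa. A layout sketch, assuming F32 activations and nomic-embed-text sizes (n_embd = 768, n_embd_gqa = 768):

    // one token's fused wqkv output (2304 floats):
    //   floats [   0,  768) -> Qcur
    //   floats [ 768, 1536) -> Kcur
    //   floats [1536, 2304) -> Vcur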
5826
6066
 
@@ -5828,25 +6068,34 @@ struct llm_build_context {
5828
6068
  cur = ggml_add(ctx0, cur, inpL);
5829
6069
 
5830
6070
  // attention layer norm
5831
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
6071
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
5832
6072
 
5833
6073
  struct ggml_tensor * ffn_inp = cur;
5834
6074
  cb(ffn_inp, "ffn_inp", il);
5835
6075
 
5836
6076
  // feed-forward network
5837
- cur = llm_build_ffn(ctx0, cur,
5838
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5839
- NULL, NULL,
5840
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5841
- NULL,
5842
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6077
+ if (model.arch == LLM_ARCH_BERT) {
6078
+ cur = llm_build_ffn(ctx0, cur,
6079
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6080
+ NULL, NULL,
6081
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6082
+ NULL,
6083
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6084
+ } else {
6085
+ cur = llm_build_ffn(ctx0, cur,
6086
+ model.layers[il].ffn_up, NULL,
6087
+ model.layers[il].ffn_gate, NULL,
6088
+ model.layers[il].ffn_down, NULL,
6089
+ NULL,
6090
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6091
+ }
5843
6092
  cb(cur, "ffn_out", il);
5844
6093
 
5845
6094
  // attentions bypass the intermediate layer
5846
6095
  cur = ggml_add(ctx0, cur, ffn_inp);
5847
6096
 
5848
6097
  // output layer norm
5849
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
6098
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
5850
6099
 
5851
6100
  // input for next layer
5852
6101
  inpL = cur;
@@ -5856,8 +6105,12 @@ struct llm_build_context {
5856
6105
  cur = inpL;
5857
6106
 
5858
6107
  // pooling layer
5859
- if (do_pooling) {
5860
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
6108
+ if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
6109
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6110
+ } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
6111
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6112
+ } else {
6113
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
5861
6114
  }
5862
6115
  cb(cur, "result_embd", -1);
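The former do_pooling boolean becomes a three-way dispatch on pooling_type. In summary, with cur holding one embedding per token:

    // cur : [n_embd, n_tokens] token embeddings
    // MEAN: mul_mat with inp_mean averages each sequence's tokens
    // CLS : ggml_get_rows(cur, inp_cls) keeps one row per sequence
    // NONE: cur is passed through per token, unchanged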
5863
6116
 
@@ -5883,6 +6136,10 @@ struct llm_build_context {
5883
6136
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5884
6137
  cb(KQ_mask, "KQ_mask", -1);
5885
6138
 
6139
+ // positions of the tokens in the KV cache
6140
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6141
+ cb(KQ_pos, "KQ_pos", -1);
6142
+
5886
6143
  inpL = llm_build_norm(ctx0, inpL, hparams,
5887
6144
  model.tok_norm,
5888
6145
  model.tok_norm_b,
@@ -5916,7 +6173,7 @@ struct llm_build_context {
5916
6173
 
5917
6174
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5918
6175
  model.layers[il].wo, model.layers[il].bo,
5919
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6176
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5920
6177
  cb(cur, "kqv_out", il);
5921
6178
  }
5922
6179
 
@@ -5976,12 +6233,16 @@ struct llm_build_context {
5976
6233
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5977
6234
  cb(KQ_mask, "KQ_mask", -1);
5978
6235
 
6236
+ // positions of the tokens in the KV cache
6237
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6238
+ cb(KQ_pos, "KQ_pos", -1);
6239
+
5979
6240
  for (int il = 0; il < n_layer; ++il) {
5980
6241
  struct ggml_tensor * attn_norm;
5981
6242
 
5982
6243
  attn_norm = llm_build_norm(ctx0, inpL, hparams,
5983
6244
  model.layers[il].attn_norm,
5984
- NULL,
6245
+ model.layers[il].attn_norm_b,
5985
6246
  LLM_NORM, cb, il);
5986
6247
  cb(attn_norm, "attn_norm", il);
5987
6248
 
@@ -5992,6 +6253,11 @@ struct llm_build_context {
5992
6253
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5993
6254
  cb(cur, "wqkv", il);
5994
6255
 
6256
+ if (model.layers[il].bqkv) {
6257
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6258
+ cb(cur, "bqkv", il);
6259
+ }
6260
+
5995
6261
  if (hparams.f_clamp_kqv > 0.0f) {
5996
6262
  cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
5997
6263
  cb(cur, "wqkv_clamped", il);
@@ -6008,8 +6274,8 @@ struct llm_build_context {
6008
6274
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6009
6275
 
6010
6276
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6011
- model.layers[il].wo, NULL,
6012
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6277
+ model.layers[il].wo, model.layers[il].bo,
6278
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6013
6279
  cb(cur, "kqv_out", il);
6014
6280
  }
6015
6281
 
@@ -6021,13 +6287,13 @@ struct llm_build_context {
6021
6287
  {
6022
6288
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
6023
6289
  model.layers[il].ffn_norm,
6024
- NULL,
6290
+ model.layers[il].ffn_norm_b,
6025
6291
  LLM_NORM, cb, il);
6026
6292
  cb(cur, "ffn_norm", il);
6027
6293
  cur = llm_build_ffn(ctx0, cur,
6028
- model.layers[il].ffn_up, NULL,
6294
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6029
6295
  NULL, NULL,
6030
- model.layers[il].ffn_down, NULL,
6296
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6031
6297
  model.layers[il].ffn_act,
6032
6298
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6033
6299
  cb(cur, "ffn_out", il);
@@ -6044,7 +6310,7 @@ struct llm_build_context {
6044
6310
 
6045
6311
  cur = llm_build_norm(ctx0, cur, hparams,
6046
6312
  model.output_norm,
6047
- NULL,
6313
+ model.output_norm_b,
6048
6314
  LLM_NORM, cb, -1);
6049
6315
  cb(cur, "result_norm", -1);
6050
6316
 
@@ -6076,11 +6342,6 @@ struct llm_build_context {
6076
6342
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6077
6343
  cb(KQ_mask, "KQ_mask", -1);
6078
6344
 
6079
- // shift the entire K-cache if needed
6080
- if (do_rope_shift) {
6081
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6082
- }
6083
-
6084
6345
  for (int il = 0; il < n_layer; ++il) {
6085
6346
  struct ggml_tensor * inpSA = inpL;
6086
6347
 
@@ -6117,21 +6378,21 @@ struct llm_build_context {
6117
6378
 
6118
6379
  Qcur = ggml_rope_custom(
6119
6380
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6120
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6381
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6121
6382
  ext_factor, attn_factor, beta_fast, beta_slow
6122
6383
  );
6123
6384
  cb(Qcur, "Qcur", il);
6124
6385
 
6125
6386
  Kcur = ggml_rope_custom(
6126
6387
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6127
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6388
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6128
6389
  ext_factor, attn_factor, beta_fast, beta_slow
6129
6390
  );
6130
6391
  cb(Kcur, "Kcur", il);
6131
6392
 
6132
6393
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6133
6394
  model.layers[il].wo, NULL,
6134
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6395
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6135
6396
  cb(cur, "kqv_out", il);
6136
6397
  }
6137
6398
 
@@ -6199,11 +6460,6 @@ struct llm_build_context {
6199
6460
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6200
6461
  cb(KQ_mask, "KQ_mask", -1);
6201
6462
 
6202
- // shift the entire K-cache if needed
6203
- if (do_rope_shift) {
6204
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6205
- }
6206
-
6207
6463
  for (int il = 0; il < n_layer; ++il) {
6208
6464
  struct ggml_tensor * inpSA = inpL;
6209
6465
 
@@ -6233,20 +6489,20 @@ struct llm_build_context {
6233
6489
 
6234
6490
  // neox-style rotation (rope_type == 2)
6235
6491
  Qcur = ggml_rope_custom(
6236
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6492
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6237
6493
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6238
6494
  );
6239
6495
  cb(Qcur, "Qcur", il);
6240
6496
 
6241
6497
  Kcur = ggml_rope_custom(
6242
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6498
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6243
6499
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6244
6500
  );
6245
6501
  cb(Kcur, "Kcur", il);
6246
6502
 
6247
6503
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6248
6504
  model.layers[il].wo, NULL,
6249
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6505
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6250
6506
  cb(cur, "kqv_out", il);
6251
6507
  }
6252
6508
 
@@ -6313,11 +6569,6 @@ struct llm_build_context {
6313
6569
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6314
6570
  cb(KQ_mask, "KQ_mask", -1);
6315
6571
 
6316
- // shift the entire K-cache if needed
6317
- if (do_rope_shift) {
6318
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6319
- }
6320
-
6321
6572
  for (int il = 0; il < n_layer; ++il) {
6322
6573
  struct ggml_tensor * inpSA = inpL;
6323
6574
 
@@ -6353,21 +6604,21 @@ struct llm_build_context {
6353
6604
 
6354
6605
  Qcur = ggml_rope_custom(
6355
6606
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6356
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6607
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6357
6608
  ext_factor, attn_factor, beta_fast, beta_slow
6358
6609
  );
6359
6610
  cb(Qcur, "Qcur", il);
6360
6611
 
6361
6612
  Kcur = ggml_rope_custom(
6362
6613
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6363
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6614
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6364
6615
  ext_factor, attn_factor, beta_fast, beta_slow
6365
6616
  );
6366
6617
  cb(Kcur, "Kcur", il);
6367
6618
 
6368
6619
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6369
6620
  model.layers[il].wo, model.layers[il].bo,
6370
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6621
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6371
6622
  cb(cur, "kqv_out", il);
6372
6623
  }
6373
6624
 
@@ -6434,11 +6685,6 @@ struct llm_build_context {
6434
6685
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6435
6686
  cb(KQ_mask, "KQ_mask", -1);
6436
6687
 
6437
- // shift the entire K-cache if needed
6438
- if (do_rope_shift) {
6439
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6440
- }
6441
-
6442
6688
  for (int il = 0; il < n_layer; ++il) {
6443
6689
  attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
6444
6690
  model.layers[il].attn_norm,
@@ -6476,7 +6722,7 @@ struct llm_build_context {
6476
6722
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6477
6723
 
6478
6724
  Qcur = ggml_rope_custom(
6479
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6725
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6480
6726
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6481
6727
  );
6482
6728
  cb(Qcur, "Qcur", il);
@@ -6487,14 +6733,14 @@ struct llm_build_context {
6487
6733
  cb(Qcur, "Qcur", il);
6488
6734
 
6489
6735
  Kcur = ggml_rope_custom(
6490
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6736
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6491
6737
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6492
6738
  );
6493
6739
  cb(Kcur, "Kcur", il);
6494
6740
 
6495
6741
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6496
6742
  model.layers[il].wo, model.layers[il].bo,
6497
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
6743
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6498
6744
  cb(cur, "kqv_out", il);
6499
6745
  }
6500
6746
 
@@ -6556,11 +6802,6 @@ struct llm_build_context {
6556
6802
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6557
6803
  cb(KQ_mask, "KQ_mask", -1);
6558
6804
 
6559
- // shift the entire K-cache if needed
6560
- if (do_rope_shift) {
6561
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6562
- }
6563
-
6564
6805
  for (int il = 0; il < n_layer; ++il) {
6565
6806
 
6566
6807
  // norm
@@ -6584,20 +6825,20 @@ struct llm_build_context {
6584
6825
  cb(Vcur, "Vcur", il);
6585
6826
 
6586
6827
  Qcur = ggml_rope_custom(
6587
- ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
6588
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6828
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
6829
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6589
6830
  ext_factor, attn_factor, beta_fast, beta_slow);
6590
6831
  cb(Qcur, "Qcur", il);
6591
6832
 
6592
6833
  Kcur = ggml_rope_custom(
6593
- ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
6594
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6834
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
6835
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6595
6836
  ext_factor, attn_factor, beta_fast, beta_slow);
6596
6837
  cb(Kcur, "Kcur", il);
6597
6838
 
6598
6839
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6599
6840
  model.layers[il].wo, NULL,
6600
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6841
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6601
6842
  cb(cur, "kqv_out", il);
6602
6843
  }
6603
6844
  struct ggml_tensor * sa_out = cur;
@@ -6696,7 +6937,7 @@ struct llm_build_context {
6696
6937
 
6697
6938
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6698
6939
  model.layers[il].wo, model.layers[il].bo,
6699
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6940
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6700
6941
  cb(cur, "kqv_out", il);
6701
6942
  }
6702
6943
 
@@ -6761,11 +7002,6 @@ struct llm_build_context {
6761
7002
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6762
7003
  cb(KQ_mask, "KQ_mask", -1);
6763
7004
 
6764
- // shift the entire K-cache if needed
6765
- if (do_rope_shift) {
6766
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6767
- }
6768
-
6769
7005
  for (int il = 0; il < n_layer; ++il) {
6770
7006
  cur = llm_build_norm(ctx0, inpL, hparams,
6771
7007
  model.layers[il].attn_norm,
@@ -6791,21 +7027,21 @@ struct llm_build_context {
6791
7027
 
6792
7028
  struct ggml_tensor * Qcur = ggml_rope_custom(
6793
7029
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
6794
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7030
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6795
7031
  ext_factor, attn_factor, beta_fast, beta_slow
6796
7032
  );
6797
7033
  cb(Qcur, "Qcur", il);
6798
7034
 
6799
7035
  struct ggml_tensor * Kcur = ggml_rope_custom(
6800
7036
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
6801
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7037
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6802
7038
  ext_factor, attn_factor, beta_fast, beta_slow
6803
7039
  );
6804
7040
  cb(Kcur, "Kcur", il);
6805
7041
 
6806
7042
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6807
7043
  model.layers[il].wo, model.layers[il].bo,
6808
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7044
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6809
7045
  cb(cur, "kqv_out", il);
6810
7046
  }
6811
7047
 
@@ -6869,11 +7105,6 @@ struct llm_build_context {
6869
7105
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6870
7106
  cb(KQ_mask, "KQ_mask", -1);
6871
7107
 
6872
- // shift the entire K-cache if needed
6873
- if (do_rope_shift) {
6874
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6875
- }
6876
-
6877
7108
  for (int il = 0; il < n_layer; ++il) {
6878
7109
  struct ggml_tensor * inpSA = inpL;
6879
7110
 
@@ -6909,21 +7140,21 @@ struct llm_build_context {
6909
7140
 
6910
7141
  Qcur = ggml_rope_custom(
6911
7142
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6912
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7143
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6913
7144
  ext_factor, attn_factor, beta_fast, beta_slow
6914
7145
  );
6915
7146
  cb(Qcur, "Qcur", il);
6916
7147
 
6917
7148
  Kcur = ggml_rope_custom(
6918
7149
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6919
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7150
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6920
7151
  ext_factor, attn_factor, beta_fast, beta_slow
6921
7152
  );
6922
7153
  cb(Kcur, "Kcur", il);
6923
7154
 
6924
7155
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6925
7156
  model.layers[il].wo, NULL,
6926
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7157
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6927
7158
  cb(cur, "kqv_out", il);
6928
7159
  }
6929
7160
 
@@ -6988,11 +7219,6 @@ struct llm_build_context {
6988
7219
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6989
7220
  cb(KQ_mask, "KQ_mask", -1);
6990
7221
 
6991
- // shift the entire K-cache if needed
6992
- if (do_rope_shift) {
6993
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6994
- }
6995
-
6996
7222
  for (int il = 0; il < n_layer; ++il) {
6997
7223
  struct ggml_tensor * inpSA = inpL;
6998
7224
 
@@ -7028,21 +7254,21 @@ struct llm_build_context {
7028
7254
 
7029
7255
  Qcur = ggml_rope_custom(
7030
7256
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7031
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7257
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7032
7258
  ext_factor, attn_factor, beta_fast, beta_slow
7033
7259
  );
7034
7260
  cb(Qcur, "Qcur", il);
7035
7261
 
7036
7262
  Kcur = ggml_rope_custom(
7037
7263
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7038
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7264
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7039
7265
  ext_factor, attn_factor, beta_fast, beta_slow
7040
7266
  );
7041
7267
  cb(Kcur, "Kcur", il);
7042
7268
 
7043
7269
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7044
7270
  model.layers[il].wo, model.layers[il].bo,
7045
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7271
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7046
7272
  cb(cur, "kqv_out", il);
7047
7273
  }
7048
7274
 
@@ -7120,11 +7346,6 @@ struct llm_build_context {
7120
7346
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7121
7347
  cb(KQ_mask, "KQ_mask", -1);
7122
7348
 
7123
- // shift the entire K-cache if needed
7124
- if (do_rope_shift) {
7125
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7126
- }
7127
-
7128
7349
  for (int il = 0; il < n_layer; ++il) {
7129
7350
  struct ggml_tensor * inpSA = inpL;
7130
7351
 
@@ -7160,21 +7381,21 @@ struct llm_build_context {
7160
7381
 
7161
7382
  Qcur = ggml_rope_custom(
7162
7383
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7163
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7384
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7164
7385
  ext_factor, attn_factor, beta_fast, beta_slow
7165
7386
  );
7166
7387
  cb(Qcur, "Qcur", il);
7167
7388
 
7168
7389
  Kcur = ggml_rope_custom(
7169
7390
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7170
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7391
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7171
7392
  ext_factor, attn_factor, beta_fast, beta_slow
7172
7393
  );
7173
7394
  cb(Kcur, "Kcur", il);
7174
7395
 
7175
7396
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7176
7397
  model.layers[il].wo, model.layers[il].bo,
7177
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7398
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7178
7399
  cb(cur, "kqv_out", il);
7179
7400
  }
7180
7401
 
@@ -7233,8 +7454,147 @@ struct llm_build_context {
7233
7454
 
7234
7455
  return gf;
7235
7456
  }
7457
+
7458
+ struct ggml_cgraph * build_gemma() {
7459
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7460
+
7461
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
7462
+
7463
+ struct ggml_tensor * cur;
7464
+ struct ggml_tensor * inpL;
7465
+
7466
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7467
+ cb(inpL, "inp_embd", -1);
7468
+
7469
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
7470
+ cb(inpL, "inp_scaled", -1);
7471
+
7472
+ // inp_pos - contains the positions
7473
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7474
+ cb(inp_pos, "inp_pos", -1);
7475
+
7476
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7477
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7478
+ cb(KQ_mask, "KQ_mask", -1);
7479
+
7480
+ for (int il = 0; il < n_layer; ++il) {
7481
+
7482
+ // norm
7483
+ cur = llm_build_norm(ctx0, inpL, hparams,
7484
+ model.layers[il].attn_norm, NULL,
7485
+ LLM_NORM_RMS, cb, il);
7486
+ cb(cur, "attn_norm", il);
7487
+
7488
+ // self-attention
7489
+ {
7490
+ // compute Q and K and RoPE them
7491
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
7492
+ cb(Qcur, "Qcur", il);
7493
+
7494
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
7495
+ cb(Kcur, "Kcur", il);
7496
+
7497
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7498
+ cb(Vcur, "Vcur", il);
7499
+
7500
+ Qcur = ggml_rope_custom(
7501
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
7502
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7503
+ ext_factor, attn_factor, beta_fast, beta_slow);
7504
+ cb(Qcur, "Qcur", il);
7505
+
7506
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
7507
+ cb(Qcur, "Qcur_scaled", il);
7508
+
7509
+ Kcur = ggml_rope_custom(
7510
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
7511
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7512
+ ext_factor, attn_factor, beta_fast, beta_slow);
7513
+ cb(Kcur, "Kcur", il);
7514
+
7515
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7516
+ model.layers[il].wo, NULL,
7517
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7518
+ cb(cur, "kqv_out", il);
7519
+ }
7520
+
7521
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
7522
+ cb(sa_out, "sa_out", il);
7523
+
7524
+ cur = llm_build_norm(ctx0, sa_out, hparams,
7525
+ model.layers[il].ffn_norm, NULL,
7526
+ LLM_NORM_RMS, cb, il);
7527
+ cb(cur, "ffn_norm", il);
7528
+
7529
+ // feed-forward network
7530
+ {
7531
+ cur = llm_build_ffn(ctx0, cur,
7532
+ model.layers[il].ffn_up, NULL,
7533
+ model.layers[il].ffn_gate, NULL,
7534
+ model.layers[il].ffn_down, NULL,
7535
+ NULL,
7536
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
7537
+ cb(cur, "ffn_out", il);
7538
+ }
7539
+
7540
+ cur = ggml_add(ctx0, cur, sa_out);
7541
+ cb(cur, "l_out", il);
7542
+
7543
+ // input for next layer
7544
+ inpL = cur;
7545
+ }
7546
+
7547
+ cur = inpL;
7548
+
7549
+ cur = llm_build_norm(ctx0, cur, hparams,
7550
+ model.output_norm, NULL,
7551
+ LLM_NORM_RMS, cb, -1);
7552
+ cb(cur, "result_norm", -1);
7553
+
7554
+ // lm_head
7555
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7556
+ cb(cur, "result_output", -1);
7557
+
7558
+ ggml_build_forward_expand(gf, cur);
7559
+
7560
+ return gf;
7561
+ }
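Relative to build_llama, build_gemma makes three scaling choices: token embeddings are multiplied by sqrt(n_embd) right after lookup, Q is scaled by 1.0f/sqrtf(n_embd_head_k) after RoPE, and llm_build_kv is then called with a kq_scale of 1.0f instead of the usual 1.0f/sqrtf(n_embd_head). The attention scores therefore still come out as Q*K^T/sqrt(d_k):

    // (Q / sqrt(d_k)) * K^T * 1.0f  ==  (Q * K^T) / sqrt(d_k)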
7236
7562
  };
7237
7563
 
7564
+ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
7565
+ llama_batch dummy;
7566
+ dummy.n_tokens = 0;
7567
+
7568
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7569
+
7570
+ struct llm_build_context llm(lctx, dummy, cb, false);
7571
+
7572
+ llm.init();
7573
+
7574
+ struct ggml_cgraph * result = llm.build_defrag(ids);
7575
+
7576
+ llm.free();
7577
+
7578
+ return result;
7579
+ }
7580
+
7581
+ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
7582
+ llama_batch dummy;
7583
+ dummy.n_tokens = 0;
7584
+
7585
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7586
+
7587
+ struct llm_build_context llm(lctx, dummy, cb, false);
7588
+
7589
+ llm.init();
7590
+
7591
+ struct ggml_cgraph * result = llm.build_k_shift();
7592
+
7593
+ llm.free();
7594
+
7595
+ return result;
7596
+ }
7597
+
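Both helpers build a single-purpose graph against a zero-token dummy batch and a no-op callback, since only cache tensors are involved. build_k_shift itself lies outside this hunk; a rough sketch of its per-layer body, assuming the usual K-cache view conventions (not necessarily the exact upstream code):

    for (int il = 0; il < n_layer; ++il) {
        // view the whole K cache of layer il and re-rope it in place
        // by the per-cell deltas stored in inp_K_shift
        struct ggml_tensor * k =
            ggml_view_3d(ctx0, kv_self.k_l[il],
                n_embd_head_k, n_head_kv, n_ctx,
                ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                0);
        ggml_build_forward_expand(gf,
            ggml_rope_custom_inplace(ctx0, k, lctx.inp_K_shift,
                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow));
    }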
7238
7598
  static struct ggml_cgraph * llama_build_graph(
7239
7599
  llama_context & lctx,
7240
7600
  const llama_batch & batch,
@@ -7289,6 +7649,7 @@ static struct ggml_cgraph * llama_build_graph(
7289
7649
  result = llm.build_refact();
7290
7650
  } break;
7291
7651
  case LLM_ARCH_BERT:
7652
+ case LLM_ARCH_NOMIC_BERT:
7292
7653
  {
7293
7654
  result = llm.build_bert();
7294
7655
  } break;
@@ -7340,6 +7701,10 @@ static struct ggml_cgraph * llama_build_graph(
7340
7701
  {
7341
7702
  result = llm.build_minicpm();
7342
7703
  } break;
7704
+ case LLM_ARCH_GEMMA:
7705
+ {
7706
+ result = llm.build_gemma();
7707
+ } break;
7343
7708
  default:
7344
7709
  GGML_ASSERT(false);
7345
7710
  }
@@ -7349,6 +7714,20 @@ static struct ggml_cgraph * llama_build_graph(
7349
7714
  return result;
7350
7715
  }
7351
7716
 
7717
+ static void llama_set_k_shift(llama_context & lctx) {
7718
+ const auto & cparams = lctx.cparams;
7719
+
7720
+ const int64_t n_ctx = cparams.n_ctx;
7721
+
7722
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7723
+
7724
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7725
+
7726
+ for (int i = 0; i < n_ctx; ++i) {
7727
+ data[i] = lctx.kv_self.cells[i].delta;
7728
+ }
7729
+ }
7730
+
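llama_set_k_shift copies one accumulated position delta per cache cell into inp_K_shift; the K-shift graph consumes them once, after which llama_kv_cache_update_internal (further below) resets every delta to 0 and clears has_shift. For example, after a sequence is shifted left by 2 positions, each affected cell carries delta = -2 and its cached K rows are re-roped by that amount on the next llama_kv_cache_update.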
7352
7731
  static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7353
7732
  //
7354
7733
  // set input data
@@ -7404,42 +7783,90 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7404
7783
  }
7405
7784
  }
7406
7785
 
7407
- {
7408
- assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7409
- float * data = (float *) lctx.inp_sum->data;
7786
+ if (hparams.need_kq_pos) {
7787
+ const int64_t n_kv = kv_self.n;
7788
+
7789
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
7790
+
7791
+ float * data = (float *) lctx.inp_KQ_pos->data;
7410
7792
 
7411
- for (int i = 0; i < batch.n_tokens; ++i) {
7412
- data[i] = 1.0f/float(batch.n_tokens);
7793
+ for (int i = 0; i < n_kv; ++i) {
7794
+ data[i] = float(lctx.kv_self.cells[i].pos);
7413
7795
  }
7414
7796
  }
7415
7797
 
7416
- if (kv_self.has_shift) {
7417
- const int64_t n_ctx = cparams.n_ctx;
7798
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7799
+ const int64_t n_tokens = batch.n_tokens;
7800
+
7801
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7802
+ float * data = (float *) lctx.inp_mean->data;
7803
+
7804
+ memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7418
7805
 
7419
- assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7806
+ std::vector<uint64_t> sum(n_tokens, 0);
7807
+ for (int i = 0; i < n_tokens; ++i) {
7808
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7809
+ sum[seq_id] += 1;
7810
+ }
7420
7811
 
7421
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7812
+ std::vector<float> div(n_tokens, 0.0f);
7813
+ for (int i = 0; i < n_tokens; ++i) {
7814
+ const uint64_t s = sum[i];
7815
+ if (s > 0) {
7816
+ div[i] = 1.0f/float(s);
7817
+ }
7818
+ }
7422
7819
 
7423
- for (int i = 0; i < n_ctx; ++i) {
7424
- data[i] = lctx.kv_self.cells[i].delta;
7820
+ for (int i = 0; i < n_tokens; ++i) {
7821
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7822
+ data[seq_id*n_tokens + i] = div[seq_id];
7425
7823
  }
7426
7824
  }
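A worked example of the inp_mean weights, assuming a 4-token batch where tokens 0-2 belong to seq 0 and token 3 to seq 1: sum = {3, 1, 0, 0} and div = {1/3, 1, 0, 0}, so row 0 of the matrix is [1/3, 1/3, 1/3, 0] and row 1 is [0, 0, 0, 1]. Multiplying the token embeddings by this matrix yields each sequence's mean embedding in rows 0 and 1, and zeros elsewhere.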
7427
7825
 
7428
- if (hparams.pooling_layer && cparams.do_pooling) {
7826
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7429
7827
  const int64_t n_tokens = batch.n_tokens;
7430
7828
 
7431
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7432
- float * data = (float *) lctx.inp_sum->data;
7433
-
7434
- memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
7829
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
7830
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
7435
7831
 
7436
7832
  for (int i = 0; i < n_tokens; ++i) {
7437
7833
  const llama_seq_id seq_id = batch.seq_id[i][0];
7438
- data[seq_id*n_tokens + i] = 1.0f;
7834
+ const llama_pos pos = batch.pos[i];
7835
+ if (pos == 0) {
7836
+ data[seq_id] = i;
7837
+ }
7439
7838
  }
7440
7839
  }
7441
7840
  }
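inp_cls records, for each sequence, the batch index of its token with pos == 0 (the BERT-style [CLS] token), which the CLS pooling branch then gathers with ggml_get_rows. With the 4-token example above, where both sequences start at pos 0, data = {0, 3}.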
7442
7841
 
7842
+ static void llama_graph_compute(
7843
+ llama_context & lctx,
7844
+ ggml_cgraph * gf,
7845
+ int n_threads) {
7846
+ #ifdef GGML_USE_MPI
7847
+ const int64_t n_layer = lctx.model.hparams.n_layer;
7848
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7849
+ #endif
7850
+
7851
+ #ifdef GGML_USE_METAL
7852
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
7853
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7854
+ }
7855
+ #endif
7856
+
7857
+ if (lctx.backend_cpu != nullptr) {
7858
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7859
+ }
7860
+
7861
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
7862
+
7863
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7864
+
7865
+ #ifdef GGML_USE_MPI
7866
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7867
+ #endif
7868
+ }
7869
+
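Factoring the MPI pre/post hooks, the Metal callback count and the CPU thread count into llama_graph_compute gives decode, K-shift and defrag one shared compute path. Typical call site, as used by the K-shift update further below:

    ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);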
7443
7870
  // decode a batch of tokens by evaluating the transformer
7444
7871
  //
7445
7872
  // - lctx: llama context
@@ -7466,9 +7893,9 @@ static int llama_decode_internal(
7466
7893
  const auto n_batch = cparams.n_batch;
7467
7894
 
7468
7895
  GGML_ASSERT(n_tokens <= n_batch);
7896
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7469
7897
 
7470
7898
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
7471
- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7472
7899
 
7473
7900
  const int64_t t_start_us = ggml_time_us();
7474
7901
 
@@ -7517,6 +7944,8 @@ static int llama_decode_internal(
7517
7944
  batch.seq_id = seq_id_arr.data();
7518
7945
  }
7519
7946
 
7947
+ llama_kv_cache_update(&lctx);
7948
+
7520
7949
  // if we have enough unused cells before the current head ->
7521
7950
  // better to start searching from the beginning of the cache, hoping to fill it
7522
7951
  if (kv_self.head > kv_self.used + 2*n_tokens) {
@@ -7541,8 +7970,9 @@ static int llama_decode_internal(
7541
7970
  ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7542
7971
 
7543
7972
  // the output is always the last tensor in the graph
7544
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7973
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7545
7974
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7975
+
7546
7976
  if (strcmp(res->name, "result_output") == 0) {
7547
7977
  // the embeddings could be the second to last tensor, or the third to last tensor
7548
7978
  if (strcmp(embeddings->name, "result_norm") != 0) {
@@ -7569,40 +7999,12 @@ static int llama_decode_internal(
7569
7999
  n_threads = std::min(4, n_threads);
7570
8000
  }
7571
8001
 
7572
- #ifdef GGML_USE_MPI
7573
- const int64_t n_layer = hparams.n_layer;
7574
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7575
- #endif
7576
-
7577
- #ifdef GGML_USE_METAL
7578
- if (ggml_backend_is_metal(lctx.backend_metal)) {
7579
- ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7580
- }
7581
- #endif
7582
-
7583
- if (lctx.backend_cpu != nullptr) {
7584
- ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7585
- }
7586
-
7587
8002
  llama_set_inputs(lctx, batch);
7588
8003
 
7589
- ggml_backend_sched_graph_compute(lctx.sched, gf);
7590
-
7591
- // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7592
-
7593
- #ifdef GGML_USE_MPI
7594
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7595
- #endif
8004
+ llama_graph_compute(lctx, gf, n_threads);
7596
8005
 
7597
8006
  // update the kv ring buffer
7598
8007
  {
7599
- if (kv_self.has_shift) {
7600
- kv_self.has_shift = false;
7601
- for (uint32_t i = 0; i < kv_self.size; ++i) {
7602
- kv_self.cells[i].delta = 0;
7603
- }
7604
- }
7605
-
7606
8008
  kv_self.head += n_tokens;
7607
8009
 
7608
8010
  // Ensure kv cache head points to a valid index.
@@ -7611,91 +8013,342 @@ static int llama_decode_internal(
7611
8013
  }
7612
8014
  }
7613
8015
 
8016
+ // decide if we need to defrag the kv cache
8017
+ if (cparams.defrag_thold >= 0.0f) {
8018
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
8019
+
8020
+ // queue defragmentation for next llama_kv_cache_update
8021
+ if (fragmentation > cparams.defrag_thold) {
8022
+ //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
8023
+
8024
+ llama_kv_cache_defrag(kv_self);
8025
+ }
8026
+ }
8027
+
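The fragmentation measure is the share of the active cache window not occupied by live cells. For example, with kv_self.n = 512, kv_self.used = 200 and n_tokens = 16, fragmentation = 1 - 216/512 ≈ 0.58, so any defrag_thold below 0.58 queues a defrag for the next llama_kv_cache_update; windows smaller than 128 cells always report 0.0f and are never defragmented.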
7614
8028
  #ifdef GGML_PERF
7615
8029
  // print timing information per ggml operation (for debugging purposes)
7616
8030
  // requires GGML_PERF to be defined
7617
8031
  ggml_graph_print(gf);
7618
8032
  #endif
7619
8033
 
7620
- // plot the computation graph in dot format (for debugging purposes)
7621
- //if (n_past%100 == 0) {
7622
- // ggml_graph_dump_dot(gf, NULL, "llama.dot");
7623
- //}
8034
+ // plot the computation graph in dot format (for debugging purposes)
8035
+ //if (n_past%100 == 0) {
8036
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
8037
+ //}
8038
+
8039
+ // extract logits
8040
+ // TODO: do not compute and extract logits if only embeddings are needed
8041
+ // need to update the graphs to skip "result_output"
8042
+ if (res) {
8043
+ auto & logits_out = lctx.logits;
8044
+
8045
+ #ifndef NDEBUG
8046
+ auto & logits_valid = lctx.logits_valid;
8047
+ logits_valid.clear();
8048
+ logits_valid.resize(n_tokens);
8049
+
8050
+ logits_out.clear();
8051
+ #endif
8052
+
8053
+ ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8054
+ GGML_ASSERT(res_backend != nullptr);
8055
+ if (batch.logits) {
8056
+ logits_out.resize(n_vocab * n_tokens);
8057
+ for (uint32_t i = 0; i < n_tokens; i++) {
8058
+ if (batch.logits[i] == 0) {
8059
+ continue;
8060
+ }
8061
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8062
+ #ifndef NDEBUG
8063
+ logits_valid[i] = true;
8064
+ #endif
8065
+ }
8066
+ } else if (lctx.logits_all) {
8067
+ logits_out.resize(n_vocab * n_tokens);
8068
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8069
+ #ifndef NDEBUG
8070
+ std::fill(logits_valid.begin(), logits_valid.end(), true);
8071
+ #endif
8072
+ } else {
8073
+ logits_out.resize(n_vocab);
8074
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8075
+ #ifndef NDEBUG
8076
+ logits_valid[0] = true;
8077
+ #endif
8078
+ }
8079
+ ggml_backend_synchronize(res_backend);
8080
+ }
8081
+
8082
+ // extract embeddings
8083
+ if (!lctx.embedding.empty()) {
8084
+ auto & embedding_out = lctx.embedding;
8085
+
8086
+ const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8087
+ const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8088
+
8089
+ embedding_out.resize(embd_size);
8090
+ ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8091
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8092
+ ggml_backend_synchronize(embeddings_backend);
8093
+ }
8094
+
8095
+ // measure the performance only for the single-token evals
8096
+ if (n_tokens == 1) {
8097
+ lctx.t_eval_us += ggml_time_us() - t_start_us;
8098
+ lctx.n_eval++;
8099
+ }
8100
+ else if (n_tokens > 1) {
8101
+ lctx.t_p_eval_us += ggml_time_us() - t_start_us;
8102
+ lctx.n_p_eval += n_tokens;
8103
+ }
8104
+
8105
+ // get a more accurate load time, upon first eval
8106
+ // TODO: fix this
8107
+ if (!lctx.has_evaluated_once) {
8108
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
8109
+ lctx.has_evaluated_once = true;
8110
+ }
8111
+
8112
+ return 0;
8113
+ }
8114
+
8115
+ // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
8116
+ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8117
+ auto & kv_self = lctx.kv_self;
8118
+
8119
+ const auto & hparams = lctx.model.hparams;
8120
+
8121
+ const uint32_t n_layer = hparams.n_layer;
8122
+
8123
+ const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
8124
+ const uint32_t n_used = kv_self.used;
8125
+
8126
+ assert(n_used <= n_kv);
8127
+
8128
+ //const int64_t t_start = ggml_time_us();
8129
+
8130
+ // number of cells moved
8131
+ uint32_t n_moves = 0;
8132
+
8133
+ // determine which KV cells to move where
8134
+ //
8135
+ // cell i moves to ids[i]
8136
+ //
8137
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
8138
+ //
8139
+ std::vector<uint32_t> ids(n_kv, n_kv);
8140
+
8141
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
8142
+ const auto & cell0 = kv_self.cells[i0];
8143
+
8144
+ if (!cell0.is_empty()) {
8145
+ ids[i0] = i0;
8146
+
8147
+ continue;
8148
+ }
8149
+
8150
+ // found a hole - fill it with data from the end of the cache
8151
+
8152
+ uint32_t nh = 1;
8153
+
8154
+ // determine the size of the hole
8155
+ while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
8156
+ nh++;
8157
+ }
8158
+
8159
+ // each move requires 6*n_layer tensors (see build_defrag)
8160
+ // - source view, destination view, copy operation
8161
+ // - x2 for keys and values
8162
+ //
8163
+ if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
8164
+ // the graph is too big, we cannot move more cells
8165
+ break;
8166
+ }
8167
+
8168
+ uint32_t nf = 0;
8169
+ uint32_t is = n_kv - 1;
8170
+
8171
+ // starting from the end, find nh non-empty cells
8172
+ for (; is > i0; --is) {
8173
+ const auto & cell1 = kv_self.cells[is];
8174
+
8175
+ if (cell1.is_empty() || ids[is] != n_kv) {
8176
+ continue;
8177
+ }
8178
+
8179
+ // non-empty cell which is not yet moved
8180
+ nf++;
8181
+
8182
+ if (nf == nh) {
8183
+ break;
8184
+ }
8185
+ }
8186
+
8187
+ // this can only happen if `n_used` is not accurate, which would be a bug
8188
+ GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
8189
+
8190
+ nf = 0;
8191
+
8192
+ uint32_t i1 = is;
8193
+
8194
+ // are we moving a contiguous block of memory?
8195
+ bool cont = false;
8196
+
8197
+ // go back and move the nf cells to the hole
8198
+ for (; i1 < n_kv; ++i1) {
8199
+ auto & cell1 = kv_self.cells[i1];
8200
+
8201
+ if (cell1.is_empty() || ids[i1] != n_kv) {
8202
+ cont = false;
8203
+ continue;
8204
+ }
8205
+
8206
+ // this cell goes to (i0 + nf)
8207
+ ids[i1] = i0 + nf;
8208
+
8209
+ // move the cell meta data
8210
+ kv_self.cells[i0 + nf] = cell1;
8211
+
8212
+ // clear the old cell and move the head there
8213
+ cell1 = llama_kv_cell();
8214
+ kv_self.head = n_used;
8215
+
8216
+ if (!cont) {
8217
+ n_moves++;
8218
+ cont = true;
8219
+ }
8220
+
8221
+ nf++;
8222
+
8223
+ if (nf == nh) {
8224
+ break;
8225
+ }
8226
+ }
8227
+
8228
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
8229
+
8230
+ i0 += nh - 1;
8231
+ }
8232
+
8233
+ if (n_moves == 0) {
8234
+ return;
8235
+ }
8236
+
8237
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
8238
+
8239
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
8240
+
8241
+ #if 0
8242
+ // CPU defrag
8243
+ //
8244
+ // TODO: optimizations are possible:
8245
+ // - multiple threads
8246
+ // - avoid copying to the host memory when already there
8247
+ //
8248
+ // likely not worth the effort, as we have ggml_graph based defrag
8249
+ //
8250
+
8251
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
8252
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
8253
+
8254
+ const uint32_t kv_size = kv_self.size;
8255
+
8256
+ std::vector<uint8_t> buf_k;
8257
+ std::vector<uint8_t> buf_v;
8258
+
8259
+ for (uint32_t il = 0; il < n_layer; ++il) {
8260
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
8261
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
8262
+
8263
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
8264
+ const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
8265
+
8266
+ buf_k.resize(k_size);
8267
+ buf_v.resize(v_size);
8268
+
8269
+ ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8270
+ ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8271
+
8272
+ // batch move [i, i+nm) to [id, id+nm)
8273
+ // note: cells can move only to a lower index
8274
+ for (uint32_t i = 0; i < n_kv; ++i) {
8275
+ const uint32_t id = ids[i];
8276
+
8277
+ if (i == id || id == n_kv) {
8278
+ continue;
8279
+ }
8280
+
8281
+ uint32_t nm = 1;
8282
+
8283
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
8284
+ nm++;
8285
+ }
8286
+
8287
+ // move keys
8288
+ {
8289
+ const int64_t os = i*k_size_row;
8290
+ const int64_t od = id*k_size_row;
8291
+
8292
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
8293
+ }
8294
+
8295
+ // move values (note: they are transposed)
8296
+ {
8297
+ const int64_t os = i;
8298
+ const int64_t od = id;
8299
+
8300
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
8301
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
8302
+ }
8303
+ }
8304
+
8305
+ i += nm - 1;
8306
+ }
8307
+
8308
+ ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8309
+ ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8310
+ }
8311
+ #else
8312
+ // ggml_graph defrag
8313
+
8314
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
8315
+
8316
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8317
+ #endif
8318
+
8319
+ //const int64_t t_end = ggml_time_us();
7624
8320
 
7625
- // extract logits
7626
- // TODO: do not compute and extract logits if only embeddings are needed
7627
- // need to update the graphs to skip "result_output"
7628
- if (res) {
7629
- auto & logits_out = lctx.logits;
8321
+ //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
8322
+ }
7630
8323
 
7631
- #ifndef NDEBUG
7632
- auto & logits_valid = lctx.logits_valid;
7633
- logits_valid.clear();
7634
- logits_valid.resize(n_tokens);
8324
+ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
8325
+ // apply K-shift if needed
8326
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
8327
+ llama_set_k_shift(lctx);
7635
8328
 
7636
- logits_out.clear();
7637
- #endif
8329
+ {
8330
+ ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
7638
8331
 
7639
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
7640
- GGML_ASSERT(res_backend != nullptr);
7641
- if (batch.logits) {
7642
- logits_out.resize(n_vocab * n_tokens);
7643
- for (uint32_t i = 0; i < n_tokens; i++) {
7644
- if (batch.logits[i] == 0) {
7645
- continue;
7646
- }
7647
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
7648
- #ifndef NDEBUG
7649
- logits_valid[i] = true;
7650
- #endif
7651
- }
7652
- } else if (lctx.logits_all) {
7653
- logits_out.resize(n_vocab * n_tokens);
7654
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
7655
- #ifndef NDEBUG
7656
- std::fill(logits_valid.begin(), logits_valid.end(), true);
7657
- #endif
7658
- } else {
7659
- logits_out.resize(n_vocab);
7660
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
7661
- #ifndef NDEBUG
7662
- logits_valid[0] = true;
7663
- #endif
8332
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
7664
8333
  }
7665
- ggml_backend_synchronize(res_backend);
7666
- }
7667
8334
 
7668
- // extract embeddings
7669
- if (!lctx.embedding.empty()) {
7670
- auto & embedding_out = lctx.embedding;
8335
+ {
8336
+ auto & kv_self = lctx.kv_self;
7671
8337
 
7672
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
7673
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8338
+ kv_self.has_shift = false;
7674
8339
 
7675
- embedding_out.resize(embd_size);
7676
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
7677
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
7678
- ggml_backend_synchronize(embeddings_backend);
8340
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
8341
+ kv_self.cells[i].delta = 0;
8342
+ }
8343
+ }
7679
8344
  }
7680
8345
 
7681
- // measure the performance only for the single-token evals
7682
- if (n_tokens == 1) {
7683
- lctx.t_eval_us += ggml_time_us() - t_start_us;
7684
- lctx.n_eval++;
7685
- }
7686
- else if (n_tokens > 1) {
7687
- lctx.t_p_eval_us += ggml_time_us() - t_start_us;
7688
- lctx.n_p_eval += n_tokens;
7689
- }
8346
+ // defragment the KV cache if needed
8347
+ if (lctx.kv_self.do_defrag) {
8348
+ llama_kv_cache_defrag_internal(lctx);
7690
8349
 
7691
- // get a more accurate load time, upon first eval
7692
- // TODO: fix this
7693
- if (!lctx.has_evaluated_once) {
7694
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
7695
- lctx.has_evaluated_once = true;
8350
+ lctx.kv_self.do_defrag = false;
7696
8351
  }
7697
-
7698
- return 0;
7699
8352
  }
7700
8353
 
7701
8354
  //
@@ -8289,37 +8942,46 @@ struct llm_tokenizer_wpm {
8289
8942
  }
8290
8943
 
8291
8944
  std::vector<std::string> preprocess(const std::string & text) {
8292
- std::string ori_str = normalize(text);
8293
- uint64_t ori_size = ori_str.size();
8945
+ // normalization form D (NFD)
8946
+ std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
8947
+ std::vector<uint32_t> nfd_codepoints;
8948
+ for (uint32_t code : codepoints) {
8949
+ auto it = nfd_map.equal_range(code);
8950
+ if (it.first != it.second) {
8951
+ for (auto jt = it.first; jt != it.second; jt++) {
8952
+ nfd_codepoints.push_back(jt->second);
8953
+ }
8954
+ } else {
8955
+ nfd_codepoints.push_back(code);
8956
+ }
8957
+ }
8294
8958
 
8295
- // single punct / single symbol / single digit
8296
- // baseline: add whitespace on the left and right of punct and chinese characters
8297
- std::vector<std::string> words;
8959
+ // strip accents, strip control, uniformize whitespace,
8960
+ // to lowercase, pad chinese characters, pad punctuation
8298
8961
  std::string new_str = "";
8299
- uint64_t i = 0;
8300
- while (i < ori_size) {
8301
- int utf_char_len = utf8_len(ori_str[i]);
8302
- if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8303
- new_str += " ";
8304
- new_str += ori_str[i];
8305
- new_str += " ";
8306
- i += 1;
8962
+ for (uint32_t code : nfd_codepoints) {
8963
+ int type = codepoint_type(code);
8964
+ if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
8965
+ continue;
8966
+ }
8967
+ code = to_lower(code);
8968
+ if (type == CODEPOINT_TYPE_WHITESPACE) {
8969
+ code = ' ';
8307
8970
  }
8308
- else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
8971
+ std::string s = codepoint_to_utf8(code);
8972
+ if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
8309
8973
  new_str += " ";
8310
- new_str += ori_str.substr(i, 3);
8974
+ new_str += s;
8311
8975
  new_str += " ";
8312
- i += 3;
8313
- }
8314
- else {
8315
- new_str += ori_str[i];
8316
- i += 1;
8976
+ } else {
8977
+ new_str += s;
8317
8978
  }
8318
8979
  }
8319
8980
 
8320
8981
  // split by whitespace
8321
8982
  uint64_t l = 0;
8322
8983
  uint64_t r = 0;
8984
+ std::vector<std::string> words;
8323
8985
  while (r < new_str.size()) {
8324
8986
  // if is whitespace
8325
8987
  if (isspace(new_str[r])) {
@@ -8337,47 +8999,21 @@ struct llm_tokenizer_wpm {
8337
8999
  return words;
8338
9000
  }
8339
9001
 
8340
- std::string normalize(const std::string & text) {
8341
- // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8342
- std::string text2 = strip_accents(text);
8343
- for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8344
- char c = text2[i];
8345
- if (c >= 'A' && c <= 'Z') {
8346
- text2[i] = c - 'A' + 'a';
8347
- }
9002
+ uint32_t to_lower(uint32_t code) {
9003
+ static const std::locale locale("en_US.UTF-8");
9004
+ #if defined(_WIN32)
9005
+ if (code > 0xFFFF) {
9006
+ return code;
8348
9007
  }
8349
- return text2;
9008
+ #endif
9009
+ return std::tolower(wchar_t(code), locale);
8350
9010
  }
8351
9011
 
8352
- bool is_chinese_char(const std::string & str) {
8353
- int len = str.length();
8354
- unsigned int codepoint = 0;
8355
- int num_bytes = 0;
8356
- int i = 0;
8357
- unsigned char ch = static_cast<unsigned char>(str[i]);
8358
- if (ch <= 0x7f) {
8359
- codepoint = ch;
8360
- num_bytes = 1;
8361
- } else if ((ch >> 5) == 0x06) {
8362
- codepoint = ch & 0x1f;
8363
- num_bytes = 2;
8364
- } else if ((ch >> 4) == 0x0e) {
8365
- codepoint = ch & 0x0f;
8366
- num_bytes = 3;
8367
- } else if ((ch >> 3) == 0x1e) {
8368
- codepoint = ch & 0x07;
8369
- num_bytes = 4;
8370
- }
8371
- for (int j = 1; j < num_bytes; ++j) {
8372
- if (i + j >= len) {
8373
- return false; // incomplete UTF-8 character
8374
- }
8375
- unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8376
- if ((next_ch >> 6) != 0x02) {
8377
- return false; // invalid trailing byte
8378
- }
8379
- codepoint = (codepoint << 6) | (next_ch & 0x3f);
8380
- }
9012
+ bool is_ascii_punct(uint32_t code) {
9013
+ return code < 256 && ispunct(code);
9014
+ }
9015
+
9016
+ bool is_chinese_char(uint32_t codepoint) {
8381
9017
  if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8382
9018
  (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8383
9019
  (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
@@ -8393,41 +9029,6 @@ struct llm_tokenizer_wpm {
8393
9029
  return false;
8394
9030
  }
8395
9031
 
8396
- std::string strip_accents(const std::string & input_string) {
8397
- std::string resultString;
8398
- std::map<std::string, char> accent_map = {
8399
- {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8400
- {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8401
- {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8402
- {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8403
- {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8404
- {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8405
- {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8406
- {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8407
- {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8408
- };
8409
-
8410
- for (size_t i = 0; i < input_string.length();) {
8411
- int len = utf8_len(input_string[i]);
8412
- std::string curChar = input_string.substr(i, len);
8413
- auto iter = accent_map.find(curChar);
8414
- if (iter != accent_map.end()) {
8415
- resultString += iter->second;
8416
- } else {
8417
- resultString += curChar;
8418
- }
8419
- i += len;
8420
- }
8421
-
8422
- return resultString;
8423
- }
8424
-
8425
- static size_t utf8_len(char src) {
8426
- const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8427
- uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8428
- return lookup[highbits];
8429
- }
8430
-
8431
9032
  const llama_vocab & vocab;
8432
9033
  };
8433
9034
 
@@ -9461,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
9461
10062
  }
9462
10063
  }
9463
10064
 
9464
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
9465
- llama_sample_temp(ctx, candidates_p, temp);
9466
- }
9467
-
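The deprecated llama_sample_temperature() wrapper is removed here; callers are expected to use llama_sample_temp() directly. A minimal migration sketch (not part of this diff), assuming `ctx` and `candidates` are prepared the same way as before:

    // same arguments as the removed wrapper, only the name changes
    llama_sample_temp(ctx, &candidates, 0.8f);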
9468
10065
  void llama_sample_repetition_penalties(
9469
10066
  struct llama_context * ctx,
9470
10067
  llama_token_data_array * candidates,
@@ -9591,38 +10188,6 @@ void llama_sample_apply_guidance(
9591
10188
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9592
10189
  }
9593
10190
 
9594
- void llama_sample_classifier_free_guidance(
9595
- struct llama_context * ctx,
9596
- llama_token_data_array * candidates,
9597
- struct llama_context * guidance_ctx,
9598
- float scale) {
9599
- GGML_ASSERT(ctx);
9600
- int64_t t_start_sample_us;
9601
-
9602
- t_start_sample_us = ggml_time_us();
9603
- const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
9604
-
9605
- GGML_ASSERT(n_vocab == candidates->size);
9606
- GGML_ASSERT(!candidates->sorted);
9607
-
9608
- std::vector<float> logits_base(n_vocab);
9609
- for (size_t i = 0; i < n_vocab; ++i) {
9610
- logits_base[i] = candidates->data[i].logit;
9611
- }
9612
-
9613
- float * logits_guidance = llama_get_logits(guidance_ctx);
9614
-
9615
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9616
- llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
9617
- t_start_sample_us = ggml_time_us();
9618
-
9619
- for (size_t i = 0; i < n_vocab; ++i) {
9620
- candidates->data[i].logit = logits_base[i];
9621
- }
9622
-
9623
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9624
- }
9625
-
9626
10191
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
9627
10192
  GGML_ASSERT(ctx);
9628
10193
 
@@ -10145,34 +10710,56 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10145
10710
  return std::make_pair(i_layer, n_layer);
10146
10711
  };
10147
10712
 
10148
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
10713
+ // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
10714
+ // with the quantization of the output tensor
10715
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
10716
+ (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
10149
10717
  int nx = tensor->ne[0];
10150
10718
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10151
10719
  new_type = GGML_TYPE_Q8_0;
10152
10720
  }
10153
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10721
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10722
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10154
10723
  new_type = GGML_TYPE_Q5_K;
10155
10724
  }
10156
10725
  else if (new_type != GGML_TYPE_Q8_0) {
10157
10726
  new_type = GGML_TYPE_Q6_K;
10158
10727
  }
10159
10728
  } else if (name == "token_embd.weight") {
10160
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10729
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
10730
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10161
10731
  new_type = GGML_TYPE_Q2_K;
10162
10732
  }
10733
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10734
+ new_type = GGML_TYPE_IQ3_S;
10735
+ }
10163
10736
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10164
- new_type = GGML_TYPE_Q4_K;
10737
+ new_type = GGML_TYPE_IQ3_S;
10165
10738
  }
10166
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10739
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
10740
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10167
10741
  if (name.find("attn_v.weight") != std::string::npos) {
10168
10742
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
10169
- else new_type = GGML_TYPE_Q2_K;
10743
+ else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10170
10744
  ++qs.i_attention_wv;
10171
10745
  }
10746
+ else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
10747
+ new_type = GGML_TYPE_Q4_K;
10748
+ }
10172
10749
  else if (name.find("ffn_down") != std::string::npos) {
10173
- if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
10750
+ if (qs.i_ffn_down < qs.n_ffn_down/8) {
10751
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10752
+ }
10174
10753
  ++qs.i_ffn_down;
10175
10754
  }
10755
+ else if (name.find("attn_output.weight") != std::string::npos) {
10756
+ if (qs.model.hparams.n_expert == 8) {
10757
+ new_type = GGML_TYPE_Q5_K;
10758
+ } else {
10759
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10760
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
10761
+ }
10762
+ }
10176
10763
  } else if (name.find("attn_v.weight") != std::string::npos) {
10177
10764
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
10178
10765
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -10181,12 +10768,27 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10181
10768
  new_type = GGML_TYPE_Q4_K;
10182
10769
  }
10183
10770
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10184
- new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
10771
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
10772
+ }
10773
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
10774
+ new_type = GGML_TYPE_Q4_K;
10775
+ }
10776
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10777
+ new_type = GGML_TYPE_Q4_K;
10778
+ }
10779
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
10780
+ new_type = GGML_TYPE_Q4_K;
10781
+ }
10782
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10783
+ new_type = GGML_TYPE_Q4_K;
10185
10784
  }
10186
10785
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
10187
10786
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
10188
10787
  }
10189
10788
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10789
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
10790
+ new_type = GGML_TYPE_Q5_K;
10791
+ }
10190
10792
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
10191
10793
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
10192
10794
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -10210,14 +10812,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10210
10812
  // TODO: explore better strategies
10211
10813
  new_type = GGML_TYPE_Q8_0;
10212
10814
  }
10213
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10214
- new_type = GGML_TYPE_Q2_K;
10815
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
10816
+ new_type = GGML_TYPE_IQ3_XXS;
10817
+ }
10818
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10819
+ new_type = GGML_TYPE_IQ2_S;
10820
+ }
10821
+ } else if (name.find("attn_q.weight") != std::string::npos) {
10822
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
10823
+ new_type = GGML_TYPE_IQ3_XXS;
10824
+ }
10825
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10826
+ new_type = GGML_TYPE_IQ2_S;
10215
10827
  }
10216
10828
  } else if (name.find("ffn_down") != std::string::npos) {
10217
10829
  auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
10218
10830
  int i_layer = info.first, n_layer = info.second;
10219
10831
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
10220
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10832
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
10221
10833
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
10222
10834
  }
10223
10835
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10228,6 +10840,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10228
10840
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
10229
10841
  : GGML_TYPE_Q3_K;
10230
10842
  }
10843
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
10844
+ (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
10845
+ new_type = GGML_TYPE_Q4_K;
10846
+ }
10231
10847
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
10232
10848
  new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
10233
10849
  }
@@ -10239,6 +10855,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10239
10855
  if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10240
10856
  }
10241
10857
  }
10858
+ else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
10859
+ new_type = GGML_TYPE_Q5_K;
10860
+ }
10242
10861
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10243
10862
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
10244
10863
  new_type = GGML_TYPE_Q5_K;
@@ -10254,39 +10873,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10254
10873
  } else if (name.find("attn_output.weight") != std::string::npos) {
10255
10874
  if (arch != LLM_ARCH_FALCON) {
10256
10875
  if (qs.model.hparams.n_expert == 8) {
10257
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10258
- ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
10259
- ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
10876
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10877
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
10878
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
10879
+ ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
10260
10880
  new_type = GGML_TYPE_Q5_K;
10261
10881
  }
10262
10882
  } else {
10263
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10264
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
10265
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
10266
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10883
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10884
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
10885
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
10886
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
10887
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
10267
10888
  }
10268
10889
  } else {
10269
10890
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10270
10891
  }
10271
10892
  }
10272
10893
  else if (name.find("attn_qkv.weight") != std::string::npos) {
10273
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10894
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10895
+ new_type = GGML_TYPE_Q4_K;
10896
+ }
10274
10897
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
10275
10898
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
10276
10899
  }
10277
10900
  else if (name.find("ffn_gate") != std::string::npos) {
10278
10901
  auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
10279
10902
  int i_layer = info.first, n_layer = info.second;
10280
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10281
- new_type = GGML_TYPE_Q2_K;
10903
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
10904
+ new_type = GGML_TYPE_IQ3_XXS;
10282
10905
  }
10283
10906
  ++qs.i_ffn_gate;
10284
10907
  }
10285
10908
  else if (name.find("ffn_up") != std::string::npos) {
10286
10909
  auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
10287
10910
  int i_layer = info.first, n_layer = info.second;
10288
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10289
- new_type = GGML_TYPE_Q2_K;
10911
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
10912
+ new_type = GGML_TYPE_IQ3_XXS;
10290
10913
  }
10291
10914
  ++qs.i_ffn_up;
10292
10915
  }
@@ -10304,9 +10927,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10304
10927
  //}
10305
10928
  bool convert_incompatible_tensor = false;
10306
10929
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
10307
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
10308
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
10309
- new_type == GGML_TYPE_IQ3_XXS) {
10930
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
10931
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
10932
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
10310
10933
  int nx = tensor->ne[0];
10311
10934
  int ny = tensor->ne[1];
10312
10935
  if (nx % QK_K != 0) {
@@ -10320,12 +10943,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10320
10943
  switch (new_type) {
10321
10944
  case GGML_TYPE_IQ2_XXS:
10322
10945
  case GGML_TYPE_IQ2_XS:
10946
+ case GGML_TYPE_IQ2_S:
10323
10947
  case GGML_TYPE_IQ3_XXS:
10324
- case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
10325
- case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
10326
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10327
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10328
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10948
+ case GGML_TYPE_IQ3_S:
10949
+ case GGML_TYPE_IQ1_S:
10950
+ case GGML_TYPE_Q2_K:
10951
+ case GGML_TYPE_Q3_K:
10952
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
10953
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10954
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10955
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10329
10956
  default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
10330
10957
  }
10331
10958
  LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
@@ -10351,7 +10978,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10351
10978
  // K-quants
10352
10979
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10353
10980
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10354
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
10981
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
10355
10982
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10356
10983
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10357
10984
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -10362,7 +10989,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10362
10989
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10363
10990
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10364
10991
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10992
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
10993
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
10365
10994
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10995
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10996
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
10997
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
10998
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
10999
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
10366
11000
 
10367
11001
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
10368
11002
  }
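The switch above maps the new IQ file types onto their quantization types. A hypothetical caller-side sketch of requesting one of them through the existing public API; the file names are placeholders:

    // sketch, assuming #include "llama.h"
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;   // one of the newly mapped file types
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &qparams);
    // rc == 0 on success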
@@ -10492,7 +11126,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10492
11126
  quantize &= !params->only_copy;
10493
11127
 
10494
11128
  // do not quantize expert gating tensors
10495
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
11129
+ // NOTE: can't use LLM_TN here because the layer number is not known
11130
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
10496
11131
 
10497
11132
  // do not quantize positional embeddings and token types (BERT)
10498
11133
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
@@ -10536,6 +11171,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10536
11171
  }
10537
11172
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10538
11173
  new_type == GGML_TYPE_IQ2_XS ||
11174
+ new_type == GGML_TYPE_IQ2_S ||
11175
+ new_type == GGML_TYPE_IQ1_S ||
10539
11176
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10540
11177
  LLAMA_LOG_ERROR("\n\n============================================================\n");
10541
11178
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -10770,7 +11407,7 @@ static int llama_apply_lora_from_file_internal(
10770
11407
  {
10771
11408
  LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
10772
11409
  __func__, ftype);
10773
- return false;
11410
+ return 1;
10774
11411
  }
10775
11412
  }
10776
11413
 
@@ -10956,7 +11593,7 @@ static int llama_apply_lora_from_file_internal(
10956
11593
  struct llama_model_params llama_model_default_params() {
10957
11594
  struct llama_model_params result = {
10958
11595
  /*.n_gpu_layers =*/ 0,
10959
- /*.split_mode =*/ LLAMA_SPLIT_LAYER,
11596
+ /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
10960
11597
  /*.main_gpu =*/ 0,
10961
11598
  /*.tensor_split =*/ nullptr,
10962
11599
  /*.progress_callback =*/ nullptr,
@@ -10982,7 +11619,7 @@ struct llama_context_params llama_context_default_params() {
10982
11619
  /*.n_batch =*/ 512,
10983
11620
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
10984
11621
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
10985
- /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
11622
+ /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
10986
11623
  /*.rope_freq_base =*/ 0.0f,
10987
11624
  /*.rope_freq_scale =*/ 0.0f,
10988
11625
  /*.yarn_ext_factor =*/ -1.0f,
@@ -10990,11 +11627,11 @@ struct llama_context_params llama_context_default_params() {
10990
11627
  /*.yarn_beta_fast =*/ 32.0f,
10991
11628
  /*.yarn_beta_slow =*/ 1.0f,
10992
11629
  /*.yarn_orig_ctx =*/ 0,
11630
+ /*.defrag_thold =*/ -1.0f,
10993
11631
  /*.cb_eval =*/ nullptr,
10994
11632
  /*.cb_eval_user_data =*/ nullptr,
10995
11633
  /*.type_k =*/ GGML_TYPE_F16,
10996
11634
  /*.type_v =*/ GGML_TYPE_F16,
10997
- /*.mul_mat_q =*/ true,
10998
11635
  /*.logits_all =*/ false,
10999
11636
  /*.embedding =*/ false,
11000
11637
  /*.offload_kqv =*/ true,
@@ -11050,16 +11687,7 @@ bool llama_supports_gpu_offload(void) {
11050
11687
  #endif
11051
11688
  }
11052
11689
 
11053
- // deprecated:
11054
- bool llama_mmap_supported(void) {
11055
- return llama_supports_mmap();
11056
- }
11057
-
11058
- bool llama_mlock_supported(void) {
11059
- return llama_supports_mlock();
11060
- }
11061
-
11062
- void llama_backend_init(bool numa) {
11690
+ void llama_backend_init(void) {
11063
11691
  ggml_time_init();
11064
11692
 
11065
11693
  // needed to initialize f16 tables
@@ -11069,15 +11697,17 @@ void llama_backend_init(bool numa) {
11069
11697
  ggml_free(ctx);
11070
11698
  }
11071
11699
 
11072
- if (numa) {
11073
- ggml_numa_init();
11074
- }
11075
-
11076
11700
  #ifdef GGML_USE_MPI
11077
11701
  ggml_mpi_backend_init();
11078
11702
  #endif
11079
11703
  }
11080
11704
 
11705
+ void llama_numa_init(enum ggml_numa_strategy numa) {
11706
+ if (numa != GGML_NUMA_STRATEGY_DISABLED) {
11707
+ ggml_numa_init(numa);
11708
+ }
11709
+ }
11710
+
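llama_backend_init() no longer takes a NUMA flag; NUMA handling moves to the separate llama_numa_init() call added above. A hypothetical migration sketch; the chosen strategy value is an assumption (use GGML_NUMA_STRATEGY_DISABLED to opt out):

    // before: llama_backend_init(true);
    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // any ggml_numa_strategy, or DISABLED
    // ... load models, create contexts ...
    llama_backend_free();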
11081
11711
  void llama_backend_free(void) {
11082
11712
  #ifdef GGML_USE_MPI
11083
11713
  ggml_mpi_backend_free();
@@ -11152,7 +11782,7 @@ struct llama_context * llama_new_context_with_model(
11152
11782
  cparams.yarn_attn_factor = params.yarn_attn_factor;
11153
11783
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11154
11784
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11155
- cparams.mul_mat_q = params.mul_mat_q;
11785
+ cparams.defrag_thold = params.defrag_thold;
11156
11786
  cparams.offload_kqv = params.offload_kqv;
11157
11787
  cparams.do_pooling = params.do_pooling;
11158
11788
 
@@ -11168,16 +11798,16 @@ struct llama_context * llama_new_context_with_model(
11168
11798
  cparams.cb_eval_user_data = params.cb_eval_user_data;
11169
11799
 
11170
11800
  auto rope_scaling_type = params.rope_scaling_type;
11171
- if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
11801
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
11172
11802
  rope_scaling_type = hparams.rope_scaling_type_train;
11173
11803
  }
11174
11804
 
11175
- if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
11805
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
11176
11806
  cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
11177
11807
  }
11178
11808
 
11179
11809
  if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
11180
- cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
11810
+ cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
11181
11811
  }
11182
11812
 
11183
11813
  if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11211,8 +11841,8 @@ struct llama_context * llama_new_context_with_model(
11211
11841
  }
11212
11842
  #elif defined(GGML_USE_CUBLAS)
11213
11843
  if (model->n_gpu_layers > 0) {
11214
- // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
11215
- if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
11844
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
11845
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
11216
11846
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
11217
11847
  if (backend == nullptr) {
11218
11848
  LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11221,7 +11851,7 @@ struct llama_context * llama_new_context_with_model(
11221
11851
  }
11222
11852
  ctx->backends.push_back(backend);
11223
11853
  } else {
11224
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
11854
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
11225
11855
  for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
11226
11856
  ggml_backend_t backend = ggml_backend_cuda_init(device);
11227
11857
  if (backend == nullptr) {
@@ -11274,8 +11904,7 @@ struct llama_context * llama_new_context_with_model(
11274
11904
  }
11275
11905
  ctx->backends.push_back(ctx->backend_cpu);
11276
11906
 
11277
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
11278
- cparams.n_ctx, cparams.offload_kqv)) {
11907
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
11279
11908
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
11280
11909
  llama_free(ctx);
11281
11910
  return nullptr;
@@ -11309,7 +11938,7 @@ struct llama_context * llama_new_context_with_model(
11309
11938
  // graph inputs
11310
11939
  {
11311
11940
  ggml_init_params init_params = {
11312
- /* .mem_size */ ggml_tensor_overhead()*7,
11941
+ /* .mem_size */ ggml_tensor_overhead()*8,
11313
11942
  /* .mem_buffer */ nullptr,
11314
11943
  /* .no_alloc */ true,
11315
11944
  };
@@ -11319,15 +11948,19 @@ struct llama_context * llama_new_context_with_model(
11319
11948
  ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
11320
11949
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11321
11950
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
11951
+ ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
11322
11952
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11323
- ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
11953
+ ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
11954
+ ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11324
11955
 
11325
11956
  ggml_set_name(ctx->inp_tokens, "inp_tokens");
11326
11957
  ggml_set_name(ctx->inp_embd, "inp_embd");
11327
11958
  ggml_set_name(ctx->inp_pos, "inp_pos");
11328
11959
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
11960
+ ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
11329
11961
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11330
- ggml_set_name(ctx->inp_sum, "inp_sum");
11962
+ ggml_set_name(ctx->inp_mean, "inp_mean");
11963
+ ggml_set_name(ctx->inp_cls, "inp_cls");
11331
11964
 
11332
11965
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
11333
11966
 
@@ -11350,7 +11983,7 @@ struct llama_context * llama_new_context_with_model(
11350
11983
  }
11351
11984
 
11352
11985
  // buffer used to store the computation graph and the tensor meta data
11353
- ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
11986
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
11354
11987
 
11355
11988
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
11356
11989
 
@@ -11419,6 +12052,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
11419
12052
  return model->vocab.type;
11420
12053
  }
11421
12054
 
12055
+ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12056
+ switch (model->arch) {
12057
+ // these models do not use RoPE
12058
+ case LLM_ARCH_GPT2:
12059
+ case LLM_ARCH_GPTJ:
12060
+ case LLM_ARCH_GPTNEOX:
12061
+ case LLM_ARCH_MPT:
12062
+ case LLM_ARCH_REFACT:
12063
+ case LLM_ARCH_BLOOM:
12064
+ return LLAMA_ROPE_TYPE_NONE;
12065
+
12066
+ // use what we call a normal RoPE, operating on pairs of consecutive head values
12067
+ case LLM_ARCH_LLAMA:
12068
+ case LLM_ARCH_BAICHUAN:
12069
+ case LLM_ARCH_STARCODER:
12070
+ case LLM_ARCH_PLAMO:
12071
+ case LLM_ARCH_CODESHELL:
12072
+ case LLM_ARCH_ORION:
12073
+ case LLM_ARCH_INTERNLM2:
12074
+ case LLM_ARCH_MINICPM:
12075
+ return LLAMA_ROPE_TYPE_NORM;
12076
+
12077
+ // the pairs of head values are offset by n_rot/2
12078
+ case LLM_ARCH_FALCON:
12079
+ case LLM_ARCH_PERSIMMON:
12080
+ case LLM_ARCH_BERT:
12081
+ case LLM_ARCH_NOMIC_BERT:
12082
+ case LLM_ARCH_STABLELM:
12083
+ case LLM_ARCH_QWEN:
12084
+ case LLM_ARCH_QWEN2:
12085
+ case LLM_ARCH_PHI2:
12086
+ case LLM_ARCH_GEMMA:
12087
+ return LLAMA_ROPE_TYPE_NEOX;
12088
+
12089
+ // all model arches should be listed explicitly here
12090
+ case LLM_ARCH_UNKNOWN:
12091
+ GGML_ASSERT(false && "unknown architecture");
12092
+ break;
12093
+ }
12094
+
12095
+ return LLAMA_ROPE_TYPE_NONE;
12096
+ }
12097
+
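llama_rope_type() is a new accessor reporting which RoPE variant (none, normal, or NeoX-style) the loaded architecture uses. A hypothetical caller-side sketch; the model path is a placeholder:

    // sketch, assuming #include "llama.h"
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model) {
        const enum llama_rope_type rt = llama_rope_type(model);
        if (rt == LLAMA_ROPE_TYPE_NONE) {
            // this architecture does not apply RoPE at all
        }
        llama_free_model(model);
    }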
11422
12098
  int32_t llama_n_vocab(const struct llama_model * model) {
11423
12099
  return model->vocab.id_to_token.size();
11424
12100
  }
@@ -11521,15 +12197,6 @@ uint32_t llama_model_quantize(
11521
12197
  }
11522
12198
  }
11523
12199
 
11524
- int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11525
- try {
11526
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
11527
- } catch (const std::exception & err) {
11528
- LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
11529
- return 1;
11530
- }
11531
- }
11532
-
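The context-level llama_apply_lora_from_file() is removed; only the model-level variant kept below remains. A hypothetical migration sketch with placeholder paths and thread count:

    // before: llama_apply_lora_from_file(ctx, "adapter.bin", 1.0f, NULL, 4);
    const int32_t err = llama_model_apply_lora_from_file(model, "adapter.bin", 1.0f, NULL, 4);
    if (err != 0) {
        // adapter could not be applied
    }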
11533
12200
  int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11534
12201
  try {
11535
12202
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -11661,12 +12328,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
11661
12328
  llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
11662
12329
  }
11663
12330
 
11664
- void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12331
+ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
11665
12332
  if (delta == 0) {
11666
12333
  return;
11667
12334
  }
11668
12335
 
11669
- llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
12336
+ llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
11670
12337
  }
11671
12338
 
11672
12339
  void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -11677,6 +12344,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
11677
12344
  llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
11678
12345
  }
11679
12346
 
12347
+ llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
12348
+ return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
12349
+ }
12350
+
12351
+ void llama_kv_cache_defrag(struct llama_context * ctx) {
12352
+ llama_kv_cache_defrag(ctx->kv_self);
12353
+ }
12354
+
12355
+ void llama_kv_cache_update(struct llama_context * ctx) {
12356
+ llama_kv_cache_update_internal(*ctx);
12357
+ }
12358
+
12359
+
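These wrappers expose the new KV-cache maintenance entry points (and llama_kv_cache_seq_shift() is renamed to llama_kv_cache_seq_add() above). A hypothetical context-shift sketch that drops the oldest half of sequence 0, shifts the remainder down, and compacts the cache; sequence id and positions are placeholders:

    const llama_pos p_max = llama_kv_cache_seq_pos_max(ctx, 0); // highest position in seq 0
    llama_kv_cache_seq_rm (ctx, 0, 0,       p_max/2);           // drop the first half
    llama_kv_cache_seq_add(ctx, 0, p_max/2, -1, -(p_max/2));    // shift the rest down
    llama_kv_cache_defrag (ctx);                                // schedule defragmentation
    llama_kv_cache_update (ctx);                                // apply K-shift + defrag now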
11680
12360
  // Returns the *maximum* size of the state
11681
12361
  size_t llama_get_state_size(const struct llama_context * ctx) {
11682
12362
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -11803,10 +12483,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
11803
12483
  const auto & hparams = ctx->model.hparams;
11804
12484
  const auto & cparams = ctx->cparams;
11805
12485
 
11806
- const auto n_layer = hparams.n_layer;
11807
- const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
11808
- const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
11809
- const auto n_ctx = cparams.n_ctx;
12486
+ const uint32_t n_layer = hparams.n_layer;
12487
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12488
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12489
+ const uint32_t n_ctx = cparams.n_ctx;
11810
12490
 
11811
12491
  const size_t kv_buf_size = kv_self.total_size();
11812
12492
  const uint32_t kv_head = kv_self.head;
@@ -11819,18 +12499,21 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
11819
12499
  data_ctx->write(&kv_used, sizeof(kv_used));
11820
12500
 
11821
12501
  if (kv_buf_size) {
11822
- const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
11823
-
11824
12502
  std::vector<uint8_t> tmp_buf;
11825
12503
  for (int il = 0; il < (int) n_layer; ++il) {
11826
- tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
12504
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12505
+
12506
+ tmp_buf.resize(k_size);
11827
12507
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
11828
12508
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
11829
12509
 
11830
12510
  // v is not contiguous, copy row by row
11831
- tmp_buf.resize(elt_size*kv_head);
12511
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12512
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12513
+
12514
+ tmp_buf.resize(v_row_size);
11832
12515
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
11833
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
12516
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
11834
12517
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
11835
12518
  }
11836
12519
  }
@@ -11860,8 +12543,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
11860
12543
  }
11861
12544
 
11862
12545
  // Sets the state reading from the specified source address
11863
- size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
11864
- uint8_t * inp = src;
12546
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12547
+ const uint8_t * inp = src;
11865
12548
 
11866
12549
  // set rng
11867
12550
  {
@@ -11870,7 +12553,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
11870
12553
 
11871
12554
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
11872
12555
 
11873
- std::string rng_str((char *)inp, rng_size); inp += rng_size;
12556
+ std::string rng_str((const char *)inp, rng_size); inp += rng_size;
11874
12557
 
11875
12558
  std::istringstream rng_ss(rng_str);
11876
12559
  rng_ss >> ctx->rng;
@@ -11914,10 +12597,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
11914
12597
  const auto & hparams = ctx->model.hparams;
11915
12598
  const auto & cparams = ctx->cparams;
11916
12599
 
11917
- const int n_layer = hparams.n_layer;
11918
- const int n_embd_k_gqa = hparams.n_embd_k_gqa();
11919
- const int n_embd_v_gqa = hparams.n_embd_v_gqa();
11920
- const int n_ctx = cparams.n_ctx;
12600
+ const uint32_t n_layer = hparams.n_layer;
12601
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12602
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12603
+ const uint32_t n_ctx = cparams.n_ctx;
11921
12604
 
11922
12605
  size_t kv_buf_size;
11923
12606
  uint32_t kv_head;
@@ -11932,17 +12615,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
11932
12615
  if (kv_buf_size) {
11933
12616
  GGML_ASSERT(kv_self.total_size() == kv_buf_size);
11934
12617
 
11935
- const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
11936
-
11937
12618
  for (int il = 0; il < (int) n_layer; ++il) {
11938
- size_t k_size = elt_size*n_embd_k_gqa*kv_head;
12619
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12620
+
11939
12621
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
11940
12622
  inp += k_size;
11941
12623
 
11942
12624
  // v is not contiguous, copy row by row
11943
- size_t v_row_size = elt_size*kv_head;
12625
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12626
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12627
+
11944
12628
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
11945
- ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
12629
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
11946
12630
  inp += v_row_size;
11947
12631
  }
11948
12632
  }
@@ -12062,38 +12746,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
12062
12746
  return true;
12063
12747
  }
12064
12748
 
12065
- int llama_eval(
12066
- struct llama_context * ctx,
12067
- llama_token * tokens,
12068
- int32_t n_tokens,
12069
- int32_t n_past) {
12070
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12071
-
12072
- const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
12073
- if (ret < 0) {
12074
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12075
- }
12076
-
12077
- return ret;
12078
- }
12079
-
12080
- int llama_eval_embd(
12081
- struct llama_context * ctx,
12082
- float * embd,
12083
- int32_t n_tokens,
12084
- int32_t n_past) {
12085
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12086
-
12087
- llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
12088
-
12089
- const int ret = llama_decode_internal(*ctx, batch);
12090
- if (ret < 0) {
12091
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12092
- }
12093
-
12094
- return ret;
12095
- }
12096
-
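llama_eval() and llama_eval_embd() are removed; decoding goes through llama_decode() with an explicit batch. A hypothetical migration sketch, assuming `tokens`, `n_tokens` and `n_past` keep the meaning of the removed parameters:

    // before: llama_eval(ctx, tokens, n_tokens, n_past);
    llama_kv_cache_seq_rm(ctx, -1, n_past, -1);   // same cache trim the removed call performed
    if (llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)) != 0) {
        // handle decode failure
    }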
12097
12749
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
12098
12750
  ctx->cparams.n_threads = n_threads;
12099
12751
  ctx->cparams.n_threads_batch = n_threads_batch;
@@ -12332,6 +12984,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
12332
12984
  return 0;
12333
12985
  }
12334
12986
 
12987
+ // trim whitespace from the beginning and end of a string
12988
+ static std::string trim(const std::string & str) {
12989
+ size_t start = 0;
12990
+ size_t end = str.size();
12991
+ while (start < end && isspace(str[start])) {
12992
+ start += 1;
12993
+ }
12994
+ while (end > start && isspace(str[end - 1])) {
12995
+ end -= 1;
12996
+ }
12997
+ return str.substr(start, end - start);
12998
+ }
12999
+
13000
+ // Simple version of "llama_apply_chat_template" that only works with strings
13001
+ // This function uses heuristic checks to detect commonly used templates. It is not a jinja parser.
13002
+ static int32_t llama_chat_apply_template_internal(
13003
+ const std::string & tmpl,
13004
+ const std::vector<const llama_chat_message *> & chat,
13005
+ std::string & dest, bool add_ass) {
13006
+ // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
13007
+ std::stringstream ss;
13008
+ if (tmpl.find("<|im_start|>") != std::string::npos) {
13009
+ // chatml template
13010
+ for (auto message : chat) {
13011
+ ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
13012
+ }
13013
+ if (add_ass) {
13014
+ ss << "<|im_start|>assistant\n";
13015
+ }
13016
+ } else if (tmpl.find("[INST]") != std::string::npos) {
13017
+ // llama2 template and its variants
13018
+ // [variant] support system message
13019
+ bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
13020
+ // [variant] space before + after response
13021
+ bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
13022
+ // [variant] add BOS inside history
13023
+ bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
13024
+ // [variant] trim spaces from the input message
13025
+ bool strip_message = tmpl.find("content.strip()") != std::string::npos;
13026
+ // construct the prompt
13027
+ bool is_inside_turn = true; // skip BOS at the beginning
13028
+ ss << "[INST] ";
13029
+ for (auto message : chat) {
13030
+ std::string content = strip_message ? trim(message->content) : message->content;
13031
+ std::string role(message->role);
13032
+ if (!is_inside_turn) {
13033
+ is_inside_turn = true;
13034
+ ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
13035
+ }
13036
+ if (role == "system") {
13037
+ if (support_system_message) {
13038
+ ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
13039
+ } else {
13040
+ // if the model does not support system message, we still include it in the first message, but without <<SYS>>
13041
+ ss << content << "\n";
13042
+ }
13043
+ } else if (role == "user") {
13044
+ ss << content << " [/INST]";
13045
+ } else {
13046
+ ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
13047
+ is_inside_turn = false;
13048
+ }
13049
+ }
13050
+ // llama2 templates seem to not care about "add_generation_prompt"
13051
+ } else if (tmpl.find("<|user|>") != std::string::npos) {
13052
+ // zephyr template
13053
+ for (auto message : chat) {
13054
+ ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
13055
+ }
13056
+ if (add_ass) {
13057
+ ss << "<|assistant|>\n";
13058
+ }
13059
+ } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
13060
+ // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
13061
+ for (auto message : chat) {
13062
+ std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
13063
+ ss << bos << message->role << "\n" << message->content << "</s>\n";
13064
+ }
13065
+ if (add_ass) {
13066
+ ss << "<s>assistant\n";
13067
+ }
13068
+ } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
13069
+ // google/gemma-7b-it
13070
+ std::string system_prompt = "";
13071
+ for (auto message : chat) {
13072
+ std::string role(message->role);
13073
+ if (role == "system") {
13074
+ // gemma has no system role, so we merge the system message into the first user prompt instead
13075
+ system_prompt = trim(message->content);
13076
+ continue;
13077
+ }
13078
+ // in gemma, "assistant" is "model"
13079
+ role = role == "assistant" ? "model" : message->role;
13080
+ ss << "<start_of_turn>" << role << "\n";
13081
+ if (!system_prompt.empty() && role != "model") {
13082
+ ss << system_prompt << "\n\n";
13083
+ system_prompt = "";
13084
+ }
13085
+ ss << trim(message->content) << "<end_of_turn>\n";
13086
+ }
13087
+ if (add_ass) {
13088
+ ss << "<start_of_turn>model\n";
13089
+ }
13090
+ } else {
13091
+ // template not supported
13092
+ return -1;
13093
+ }
13094
+ dest = ss.str();
13095
+ return dest.size();
13096
+ }
13097
+
13098
+ LLAMA_API int32_t llama_chat_apply_template(
13099
+ const struct llama_model * model,
13100
+ const char * tmpl,
13101
+ const struct llama_chat_message * chat,
13102
+ size_t n_msg,
13103
+ bool add_ass,
13104
+ char * buf,
13105
+ int32_t length) {
13106
+ std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
13107
+ if (tmpl == nullptr) {
13108
+ GGML_ASSERT(model != nullptr);
13109
+ // load template from model
13110
+ std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
13111
+ std::string template_key = "tokenizer.chat_template";
13112
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
13113
+ if (res < 0) {
13114
+ // worst case: there is no information about template, we will use chatml by default
13115
+ curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
13116
+ } else {
13117
+ curr_tmpl = std::string(model_template.data(), model_template.size());
13118
+ }
13119
+ }
13120
+ // format the chat to string
13121
+ std::vector<const llama_chat_message *> chat_vec;
13122
+ chat_vec.resize(n_msg);
13123
+ for (size_t i = 0; i < n_msg; i++) {
13124
+ chat_vec[i] = &chat[i];
13125
+ }
13126
+ std::string formatted_chat;
13127
+ int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
13128
+ if (res < 0) {
13129
+ return res;
13130
+ }
13131
+ strncpy(buf, formatted_chat.c_str(), length);
13132
+ return res;
13133
+ }
13134
+
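llama_chat_apply_template() is the new public entry point around the heuristics above. A hypothetical usage sketch; the buffer size is an assumption, and passing nullptr as the template falls back to the model's tokenizer.chat_template (or chatml if that metadata is missing):

    // sketch, assuming #include "llama.h", <string>, <vector>
    llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(model, nullptr, msgs, 2, /*add_ass=*/true, buf.data(), buf.size());
    if (n > (int32_t) buf.size()) {        // return value is the required size; grow and retry
        buf.resize(n);
        n = llama_chat_apply_template(model, nullptr, msgs, 2, true, buf.data(), buf.size());
    }
    std::string prompt = n > 0 ? std::string(buf.data(), n) : std::string();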
12335
13135
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
12336
13136
  struct llama_timings result = {
12337
13137
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,