llama_cpp 0.12.7 → 0.14.0

@@ -68,10 +68,12 @@
  #include <cstdio>
  #include <cstring>
  #include <ctime>
+ #include <cwctype>
  #include <forward_list>
  #include <fstream>
  #include <functional>
  #include <initializer_list>
+ #include <locale>
  #include <map>
  #include <memory>
  #include <mutex>
@@ -102,6 +104,7 @@
  #define LLAMA_MAX_NODES 8192
  #define LLAMA_MAX_EXPERTS 8

+
  //
  // logging
  //
@@ -209,10 +212,11 @@ enum llm_arch {
  LLM_ARCH_INTERNLM2,
  LLM_ARCH_MINICPM,
  LLM_ARCH_GEMMA,
+ LLM_ARCH_STARCODER2,
  LLM_ARCH_UNKNOWN,
  };

- static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_LLAMA, "llama" },
  { LLM_ARCH_FALCON, "falcon" },
  { LLM_ARCH_GPT2, "gpt2" },
@@ -236,6 +240,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_INTERNLM2, "internlm2" },
  { LLM_ARCH_MINICPM, "minicpm" },
  { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

  enum llm_kv {
@@ -296,7 +302,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_RWKV,
  };

- static std::map<llm_kv, const char *> LLM_KV_NAMES = {
+ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -360,7 +366,7 @@ struct LLM_KV {
  llm_arch arch;

  std::string operator()(llm_kv kv) const {
- return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
+ return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
  }
  };

@@ -395,7 +401,7 @@ enum llm_tensor {
  LLM_TENSOR_LAYER_OUT_NORM,
  };

- static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
  {
  LLM_ARCH_LLAMA,
  {
@@ -777,6 +783,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_STARCODER2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -810,38 +834,38 @@ struct LLM_TN {
  llm_arch arch;

  std::string operator()(llm_tensor tensor) const {
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  return "__missing__";
  }
- return LLM_TENSOR_NAMES[arch].at(tensor);
+ return LLM_TENSOR_NAMES.at(arch).at(tensor);
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  return "__missing__";
  }
- return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
+ return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
  }

  std::string operator()(llm_tensor tensor, int bid) const {
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  return "__missing__";
  }
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  return "__missing__";
  }
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  return "__missing__";
  }
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
  }
  };

@@ -849,20 +873,20 @@ struct LLM_TN {
  // gguf helpers
  //

- static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
- { LLAMA_ROPE_SCALING_NONE, "none" },
- { LLAMA_ROPE_SCALING_LINEAR, "linear" },
- { LLAMA_ROPE_SCALING_YARN, "yarn" },
+ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
  };

- static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
+ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  if (kv.second == name) {
- return kv.first;
+ return (llama_rope_scaling_type) kv.first;
  }
  }

- return LLAMA_ROPE_SCALING_UNSPECIFIED;
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  }

  static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
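The name tables above become const in this release, so lookups go through .at() rather than operator[]: on a std::map, operator[] is non-const and default-inserts missing keys, while .at() works on a const map and throws on an unknown key. A minimal standalone C++ sketch of that difference (illustration only, not part of the upstream patch):

    #include <cstdio>
    #include <map>
    #include <stdexcept>

    static const std::map<int, const char *> NAMES = { { 0, "llama" } };

    int main() {
        // NAMES[0] would not compile: operator[] may insert and is non-const.
        std::printf("%s\n", NAMES.at(0));   // const-safe lookup
        try {
            NAMES.at(42);                   // unknown key -> std::out_of_range
        } catch (const std::out_of_range &) {
            std::printf("missing key throws\n");
        }
        return 0;
    }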
@@ -1407,7 +1431,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
  buft = ggml_backend_cuda_host_buffer_type();
  }
  #elif defined(GGML_USE_SYCL)
- buft = ggml_backend_sycl_host_buffer_type();
+ if (host_buffer) {
+ buft = ggml_backend_sycl_host_buffer_type();
+ }
  #elif defined(GGML_USE_CPU_HBM)
  buft = ggml_backend_cpu_hbm_buffer_type();
  #elif defined(GGML_USE_VULKAN)
@@ -1461,6 +1487,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
  }
  #endif

+ #ifdef GGML_USE_SYCL
+ if (ggml_backend_sycl_get_device_count() > 1) {
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+ }
+ #endif
+
  if (buft == nullptr) {
  buft = llama_default_buffer_type_offload(fallback_gpu);
  }
@@ -1472,6 +1504,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
  static size_t llama_get_device_count() {
  #if defined(GGML_USE_CUBLAS)
  return ggml_backend_cuda_get_device_count();
+ #elif defined(GGML_USE_SYCL)
+ return ggml_backend_sycl_get_device_count();
  #elif defined(GGML_USE_VULKAN)
  return ggml_backend_vk_get_device_count();
  #else
@@ -1485,6 +1519,11 @@ static size_t llama_get_device_memory(int device) {
  size_t free;
  ggml_backend_cuda_get_device_memory(device, &total, &free);
  return free;
+ #elif defined(GGML_USE_SYCL)
+ size_t total;
+ size_t free;
+ ggml_backend_sycl_get_device_memory(device, &total, &free);
+ return free;
  #elif defined(GGML_USE_VULKAN)
  size_t total;
  size_t free;
@@ -1550,8 +1589,9 @@ static const size_t MiB = 1024*kiB;
  static const size_t GiB = 1024*MiB;

  struct llama_hparams {
- bool vocab_only;
- bool rope_finetuned;
+ bool vocab_only;
+ bool rope_finetuned;
+
  uint32_t n_vocab;
  uint32_t n_ctx_train; // context size the model was trained on
  uint32_t n_embd;
@@ -1572,7 +1612,6 @@ struct llama_hparams {
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
- int32_t rope_scaling_type_train;

  float f_clamp_kqv = 0.0f;
  float f_max_alibi_bias = 0.0f;
@@ -1580,7 +1619,9 @@ struct llama_hparams {
  bool causal_attn = true;
  bool need_kq_pos = false;

- uint32_t pooling_type = LLAMA_POOLING_NONE;
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+ enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

  bool operator!=(const llama_hparams & other) const {
  if (this->vocab_only != other.vocab_only) return true;
@@ -1624,13 +1665,13 @@ struct llama_hparams {
  };

  struct llama_cparams {
- uint32_t n_ctx; // context size used during inference
+ uint32_t n_ctx; // context size used during inference
  uint32_t n_batch;
  uint32_t n_threads; // number of threads to use for generation
  uint32_t n_threads_batch; // number of threads to use for batch processing

- float rope_freq_base;
- float rope_freq_scale;
+ float rope_freq_base;
+ float rope_freq_scale;

  uint32_t n_yarn_orig_ctx;
  // These hyperparameters are not exposed in GGUF, because all
@@ -1639,10 +1680,12 @@ struct llama_cparams {
  float yarn_attn_factor;
  float yarn_beta_fast;
  float yarn_beta_slow;
+ float defrag_thold;

- bool mul_mat_q;
+ bool embeddings;
  bool offload_kqv;
- bool do_pooling;
+
+ enum llama_pooling_type pooling_type;

  ggml_backend_sched_eval_callback cb_eval;
  void * cb_eval_user_data;
@@ -1707,11 +1750,20 @@ struct llama_kv_cell {
  bool has_seq_id(const llama_seq_id & id) const {
  return seq_id.find(id) != seq_id.end();
  }
+
+ bool is_empty() const {
+ return seq_id.empty();
+ }
+
+ bool is_same_seq(const llama_kv_cell & other) const {
+ return seq_id == other.seq_id;
+ }
  };

  // ring-buffer of cached KV data
  struct llama_kv_cache {
  bool has_shift = false;
+ bool do_defrag = false;

  // Note: The value of head isn't only used to optimize searching
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1723,6 +1775,9 @@ struct llama_kv_cache {
  // computed before each graph build
  uint32_t n = 0;

+ ggml_type type_k = GGML_TYPE_F16;
+ ggml_type type_v = GGML_TYPE_F16;
+
  std::vector<llama_kv_cell> cells;

  std::vector<struct ggml_tensor *> k_l; // per layer
@@ -1919,7 +1974,7 @@ struct llama_context {
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
  int32_t n_eval = 0; // number of eval calls

- // decode output (2-dimensional array: [n_tokens][n_vocab])
+ // logits output (2-dimensional array: [n_tokens][n_vocab])
  std::vector<float> logits;
  #ifndef NDEBUG
  // guard against access to unset logits
@@ -1927,13 +1982,21 @@ struct llama_context {
  #endif
  bool logits_all = false;

- // input embedding (1-dimensional array: [n_embd])
- std::vector<float> embedding;
+ // embeddings output (2-dimensional array: [n_tokens][n_embd])
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+ std::vector<float> embd;
+
+ // sequence embeddings output (map of [n_embd] vectors)
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+ std::map<llama_seq_id, std::vector<float>> embd_seq;

  // memory buffers used to evaluate the model
  std::vector<uint8_t> buf_compute_meta;
  ggml_backend_sched_t sched = nullptr;

+ ggml_abort_callback abort_callback = nullptr;
+ void * abort_callback_data = nullptr;
+
  // input tensors
  ggml_backend_buffer_t buf_input = nullptr;
  ggml_context * ctx_input = nullptr;
@@ -1958,8 +2021,8 @@ struct llama_context {
  static bool llama_kv_cache_init(
  struct llama_kv_cache & cache,
  const llama_model & model,
- ggml_type ktype,
- ggml_type vtype,
+ ggml_type type_k,
+ ggml_type type_v,
  uint32_t n_ctx,
  bool offload) {
  const struct llama_hparams & hparams = model.hparams;
@@ -1974,6 +2037,9 @@ static bool llama_kv_cache_init(
  cache.size = n_ctx;
  cache.used = 0;

+ cache.type_k = type_k;
+ cache.type_v = type_v;
+
  cache.cells.clear();
  cache.cells.resize(n_ctx);

@@ -2014,8 +2080,8 @@ static bool llama_kv_cache_init(

  for (int i = 0; i < (int) n_layer; i++) {
  struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
- ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
  ggml_format_name(k, "cache_k_l%d", i);
  ggml_format_name(v, "cache_v_l%d", i);
  cache.k_l.push_back(k);
@@ -2097,10 +2163,12 @@ static bool llama_kv_cache_find_slot(
  }

  // find how many cells are currently in use
- static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
- for (uint32_t i = cache.size - 1; i > 0; --i) {
- if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
- return i + 1;
+ static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+ for (uint32_t i = cache.size; i > 0; --i) {
+ const llama_kv_cell & cell = cache.cells[i - 1];
+
+ if (cell.pos >= 0 && !cell.is_empty()) {
+ return i;
  }
  }

@@ -2135,7 +2203,7 @@ static void llama_kv_cache_seq_rm(
  } else {
  continue;
  }
- if (cache.cells[i].seq_id.empty()) {
+ if (cache.cells[i].is_empty()) {
  // keep count of the number of used cells
  if (cache.cells[i].pos >= 0) cache.used--;

@@ -2186,7 +2254,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
  if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }

- static void llama_kv_cache_seq_shift(
+ static void llama_kv_cache_seq_add(
  struct llama_kv_cache & cache,
  llama_seq_id seq_id,
  llama_pos p0,
@@ -2204,10 +2272,14 @@ static void llama_kv_cache_seq_shift(
  cache.cells[i].delta += delta;

  if (cache.cells[i].pos < 0) {
- if (!cache.cells[i].seq_id.empty()) cache.used--;
+ if (!cache.cells[i].is_empty()) {
+ cache.used--;
+ }
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
- if (new_head == cache.size) new_head = i;
+ if (new_head == cache.size) {
+ new_head = i;
+ }
  }
  }
  }
@@ -2239,6 +2311,22 @@ static void llama_kv_cache_seq_div(
  }
  }

+ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+ llama_pos result = 0;
+
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (cache.cells[i].has_seq_id(seq_id)) {
+ result = std::max(result, cache.cells[i].pos);
+ }
+ }
+
+ return result;
+ }
+
+ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
+ cache.do_defrag = true;
+ }
+
  //
  // model loading and saving
  //
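The two new helpers above are small: llama_kv_cache_seq_pos_max scans every cell and returns the largest position stored for a sequence, and llama_kv_cache_defrag only raises the do_defrag flag, which is consumed later when the defrag graph is built. A self-contained C++ sketch of the position scan over simplified cells (stand-in types, not the real llama_kv_cache):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <set>
    #include <vector>

    // simplified stand-in for llama_kv_cell
    struct cell { int32_t pos = -1; std::set<int32_t> seq_id; };

    static int32_t seq_pos_max(const std::vector<cell> & cells, int32_t seq) {
        int32_t result = 0;
        for (const auto & c : cells) {
            if (c.seq_id.count(seq)) {
                result = std::max(result, c.pos);   // keep the highest position seen
            }
        }
        return result;
    }

    int main() {
        std::vector<cell> cells(8);
        cells[2] = { 5, { 0 } };
        cells[6] = { 9, { 0 } };
        cells[7] = { 3, { 1 } };
        std::printf("seq 0 max pos = %d\n", seq_pos_max(cells, 0)); // prints 9
        return 0;
    }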
@@ -2310,7 +2398,7 @@ namespace GGUFMeta {
  }
  };

- struct ArrayInfo{
+ struct ArrayInfo {
  const gguf_type gt;
  const size_t length;
  const void * data;
@@ -2329,7 +2417,7 @@ namespace GGUFMeta {
  };

  template<typename T>
- class GKV: public GKV_Base<T> {
+ class GKV : public GKV_Base<T> {
  GKV() = delete;

  public:
@@ -2345,46 +2433,46 @@ namespace GGUFMeta {

  static const char * override_type_to_str(const llama_model_kv_override_type ty) {
  switch (ty) {
- case LLAMA_KV_OVERRIDE_BOOL: return "bool";
- case LLAMA_KV_OVERRIDE_INT: return "int";
- case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
+ case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
  }
  return "unknown";
  }

- static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
- if (!override) { return false; }
- if (override->tag == expected_type) {
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
+ if (!ovrd) { return false; }
+ if (ovrd->tag == expected_type) {
  LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
- __func__, override_type_to_str(override->tag), override->key);
- switch (override->tag) {
- case LLAMA_KV_OVERRIDE_BOOL: {
- LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
+ __func__, override_type_to_str(ovrd->tag), ovrd->key);
+ switch (ovrd->tag) {
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
+ LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
  } break;
- case LLAMA_KV_OVERRIDE_INT: {
- LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
+ case LLAMA_KV_OVERRIDE_TYPE_INT: {
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
  } break;
- case LLAMA_KV_OVERRIDE_FLOAT: {
- LLAMA_LOG_INFO("%.6f\n", override->float_value);
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
+ LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
  } break;
  default:
  // Shouldn't be possible to end up here, but just in case...
  throw std::runtime_error(
  format("Unsupported attempt to override %s type for metadata key %s\n",
- override_type_to_str(override->tag), override->key));
+ override_type_to_str(ovrd->tag), ovrd->key));
  }
  return true;
  }
  LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
- __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
+ __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
  return false;
  }

  template<typename OT>
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
- try_override(OT & target, const struct llama_model_kv_override *override) {
- if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
- target = override->bool_value;
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
+ target = ovrd->bool_value;
  return true;
  }
  return false;
@@ -2392,9 +2480,9 @@ namespace GGUFMeta {

  template<typename OT>
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
- try_override(OT & target, const struct llama_model_kv_override *override) {
- if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
- target = override->int_value;
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
+ target = ovrd->int_value;
  return true;
  }
  return false;
@@ -2402,9 +2490,9 @@ namespace GGUFMeta {

  template<typename OT>
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
- try_override(T & target, const struct llama_model_kv_override *override) {
- if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
- target = override->float_value;
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
+ target = ovrd->float_value;
  return true;
  }
  return false;
@@ -2412,17 +2500,17 @@ namespace GGUFMeta {

  template<typename OT>
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
- try_override(T & target, const struct llama_model_kv_override *override) {
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
  (void)target;
- (void)override;
- if (!override) { return false; }
+ (void)ovrd;
+ if (!ovrd) { return false; }
  // Currently, we should never end up here so it would be a bug if we do.
  throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
- override ? override->key : "NULL"));
+ ovrd ? ovrd->key : "NULL"));
  }

- static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
- if (try_override<T>(target, override)) {
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+ if (try_override<T>(target, ovrd)) {
  return true;
  }
  if (k < 0) { return false; }
@@ -2430,12 +2518,12 @@ namespace GGUFMeta {
  return true;
  }

- static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
- return set(ctx, gguf_find_key(ctx, key), target, override);
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+ return set(ctx, gguf_find_key(ctx, key), target, ovrd);
  }

- static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
- return set(ctx, key.c_str(), target, override);
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+ return set(ctx, key.c_str(), target, ovrd);
  }
  };
  }
@@ -2542,9 +2630,12 @@ struct llama_model_loader {
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
+ case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
+ case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
+ case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
  default:
  {
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2845,6 +2936,19 @@ struct llama_model_loader {
  }
  };

+ template<>
+ bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
+ uint32_t tmp;
+ const bool found = get_key(kid, tmp, required);
+ if (found) {
+ result = (enum llama_pooling_type) tmp;
+ } else {
+ result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+ }
+ return found;
+ }
+
+
  //
  // load LLaMA models
  //
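The get_key specialization above reads the GGUF value as a uint32_t, casts it to the enum, and falls back to LLAMA_POOLING_TYPE_UNSPECIFIED when the key is absent. A standalone sketch of that read-then-cast pattern (hypothetical key name and metadata store, not the real loader):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>

    enum pooling_type : int { UNSPECIFIED = -1, NONE = 0, MEAN = 1, CLS = 2 };

    // hypothetical stand-in for the GGUF metadata store
    static const std::map<std::string, uint32_t> kv = { { "model.pooling_type", 1 } };

    static bool get_enum(const std::string & key, pooling_type & out) {
        const auto it = kv.find(key);
        if (it == kv.end()) {
            out = UNSPECIFIED;               // absent optional key -> explicit "unspecified"
            return false;
        }
        out = (pooling_type) it->second;     // stored as an integer, cast on read
        return true;
    }

    int main() {
        pooling_type pt;
        get_enum("model.pooling_type", pt);
        std::printf("pooling_type = %d\n", pt); // prints 1 (MEAN)
        return 0;
    }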
@@ -2886,10 +2990,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";

  default: return "unknown, may not work";
  }
@@ -2923,16 +3032,16 @@ static const char * llama_model_type_name(e_model type) {
  default: return "?B";
  }
  }
+
  static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
  switch (type) {
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
- default: return "unknown";
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+ default: return "unknown";
  }
  }

-
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  model.arch = ml.get_arch();
  if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2996,7 +3105,7 @@ static void llm_load_hparams(
  std::string rope_scaling("linear");
  ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
- GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
+ GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

  // rope_freq_scale (inverse of the kv) is optional
  float ropescale = 0.0f;
@@ -3109,10 +3218,10 @@ static void llm_load_hparams(
  } break;
  case LLM_ARCH_BERT:
  {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

  switch (hparams.n_layer) {
  case 3:
@@ -3130,10 +3239,10 @@ static void llm_load_hparams(
  } break;
  case LLM_ARCH_NOMIC_BERT:
  {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  model.type = e_model::MODEL_137M;
@@ -3264,6 +3373,16 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 30: model.type = e_model::MODEL_3B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_15B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -3272,6 +3391,8 @@ static void llm_load_hparams(
  if (hparams.f_max_alibi_bias > 0.0f) {
  hparams.need_kq_pos = true;
  }
+
+ hparams.rope_type = llama_rope_type(&model);
  }

  // TODO: This should probably be in llama.h
@@ -3574,6 +3695,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3640,7 +3763,7 @@ static bool llm_load_tensors(
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
  }

- if (split_mode == LLAMA_SPLIT_LAYER) {
+ if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
  // calculate the split points
  int device_count = llama_get_device_count();
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3679,10 +3802,10 @@ static bool llm_load_tensors(
  }
  } else {
  ggml_backend_buffer_type_t split_buft;
- if (split_mode == LLAMA_SPLIT_ROW) {
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
  } else {
- // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
+ // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
  split_buft = llama_default_buffer_type_offload(main_gpu);
  }
  // assign the repeating layers
@@ -4430,6 +4553,56 @@ static bool llm_load_tensors(
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  }
  } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
+
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ // optional bias tensors
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ // optional bias tensors
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -4595,12 +4768,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

  using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;

- enum llm_rope_type {
- LLM_ROPE,
- LLM_ROPE_NEOX,
- LLM_ROPE_GLM,
- };
-
  enum llm_ffn_op_type {
  LLM_FFN_SILU,
  LLM_FFN_GELU,
@@ -4646,55 +4813,6 @@ static struct ggml_tensor * llm_build_inp_embd(
  return inpL;
  }

- // Persimmon: n_rot = n_embd_head_k/2
- // Other: n_rot = n_embd_head_k
- static void llm_build_k_shift(
- struct ggml_context * ctx,
- const llama_hparams & hparams,
- const llama_cparams & cparams,
- const llama_kv_cache & kv,
- struct ggml_cgraph * graph,
- struct ggml_tensor * K_shift,
- llm_rope_type type,
- int64_t n_ctx,
- float freq_base,
- float freq_scale,
- const llm_build_cb & cb) {
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const int32_t n_rot = hparams.n_rot;
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
- const float ext_factor = cparams.yarn_ext_factor;
- const float attn_factor = cparams.yarn_attn_factor;
- const float beta_fast = cparams.yarn_beta_fast;
- const float beta_slow = cparams.yarn_beta_slow;
-
- int rope_type = 0;
-
- switch (type) {
- case LLM_ROPE: rope_type = 0; break;
- case LLM_ROPE_NEOX: rope_type = 2; break;
- case LLM_ROPE_GLM: rope_type = 4; break;
- }
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * tmp =
- // we rotate only the first n_rot dimensions
- ggml_rope_custom_inplace(ctx,
- ggml_view_3d(ctx, kv.k_l[il],
- n_embd_head_k, n_head_kv, n_ctx,
- ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
- ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
- 0),
- K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
- cb(tmp, "K_shifted", il);
- ggml_build_forward_expand(graph, tmp);
- }
- }
-
  static void llm_build_kv_store(
  struct ggml_context * ctx,
  const llama_hparams & hparams,
@@ -4896,8 +5014,8 @@ static struct ggml_tensor * llm_build_kqv(
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
  }

- #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+ #if defined(GGML_USE_KOMPUTE)
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
  if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4981,6 +5099,7 @@ static struct ggml_tensor * llm_build_kv(
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

  struct ggml_tensor * cur;
+
  cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
  q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);
@@ -4998,6 +5117,7 @@ struct llm_build_context {

  const int64_t n_embd;
  const int64_t n_layer;
+ const int64_t n_rot;
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
  const int64_t n_head;
  const int64_t n_head_kv;
@@ -5022,8 +5142,8 @@ struct llm_build_context {
  const int32_t kv_head; // index of where we store new KV data in the cache
  const int32_t n_orig_ctx;

- const bool do_rope_shift;
- const uint32_t pooling_type;
+ const enum llama_pooling_type pooling_type;
+ const enum llama_rope_type rope_type;

  const llm_build_cb & cb;

@@ -5045,6 +5165,7 @@ struct llm_build_context {
  kv_self (lctx.kv_self),
  n_embd (hparams.n_embd),
  n_layer (hparams.n_layer),
+ n_rot (hparams.n_rot),
  n_ctx (cparams.n_ctx),
  n_head (hparams.n_head),
  n_head_kv (hparams.n_head_kv),
@@ -5066,8 +5187,8 @@ struct llm_build_context {
  n_kv (worst_case ? n_ctx : kv_self.n),
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
  n_orig_ctx (cparams.n_yarn_orig_ctx),
- do_rope_shift (worst_case || kv_self.has_shift),
- pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
+ pooling_type (cparams.pooling_type),
+ rope_type (hparams.rope_type),
  cb (cb),
  buf_compute_meta (lctx.buf_compute_meta) {
  // all initializations should be done in init()
@@ -5090,6 +5211,76 @@ struct llm_build_context {
  }
  }

+ struct ggml_cgraph * build_k_shift() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * tmp =
+ // we rotate only the first n_rot dimensions
+ ggml_rope_custom_inplace(ctx0,
+ ggml_view_3d(ctx0, kv_self.k_l[il],
+ n_embd_head_k, n_head_kv, n_ctx,
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ 0),
+ lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(tmp, "K_shifted", il);
+ ggml_build_forward_expand(gf, tmp);
+ }
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ for (uint32_t i = 0; i < ids.size(); ++i) {
+ const uint32_t id = ids[i];
+
+ if (i == id || id == ids.size()) {
+ continue;
+ }
+
+ uint32_t nm = 1;
+
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+ nm++;
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+ n_embd_k_gqa, nm,
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+ n_embd_k_gqa, nm,
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+ ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+ nm, n_embd_v_gqa,
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ ggml_row_size(kv_self.v_l[il]->type, i));
+
+ ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+ nm, n_embd_v_gqa,
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ ggml_row_size(kv_self.v_l[il]->type, id));
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+ }
+
+ i += nm - 1;
+ }
+
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+
+ return gf;
+ }
+
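build_defrag above takes a vector of destination ids (one per source cell, with ids.size() meaning "not moved") and coalesces consecutive moves — runs where ids[i + nm] == id + nm — into a single block copy per layer. A standalone sketch of just that run-coalescing step (illustration only, printing the moves instead of building a graph):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // ids[i] = destination cell for source cell i; 8 (== ids.size()) means "don't move"
        const std::vector<uint32_t> ids = { 0, 1, 8, 8, 2, 3, 4, 8 };

        for (uint32_t i = 0; i < ids.size(); ++i) {
            const uint32_t id = ids[i];
            if (i == id || id == ids.size()) {
                continue;               // already in place or not moved
            }
            uint32_t nm = 1;            // length of the contiguous run
            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
                nm++;
            }
            std::printf("move %u cells: src [%u, %u) -> dst [%u, %u)\n",
                        nm, i, i + nm, id, id + nm);
            i += nm - 1;                // skip the cells already covered by this block
        }
        // prints: move 3 cells: src [4, 7) -> dst [2, 5)
        return 0;
    }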
5093
5284
  struct ggml_cgraph * build_llama() {
5094
5285
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5095
5286
 
@@ -5111,11 +5302,6 @@ struct llm_build_context {
5111
5302
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5112
5303
  cb(KQ_mask, "KQ_mask", -1);
5113
5304
 
5114
- // shift the entire K-cache if needed
5115
- if (do_rope_shift) {
5116
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5117
- }
5118
-
5119
5305
  for (int il = 0; il < n_layer; ++il) {
5120
5306
  struct ggml_tensor * inpSA = inpL;
5121
5307
 
@@ -5151,14 +5337,14 @@ struct llm_build_context {
5151
5337
 
5152
5338
  Qcur = ggml_rope_custom(
5153
5339
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5154
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5340
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5155
5341
  ext_factor, attn_factor, beta_fast, beta_slow
5156
5342
  );
5157
5343
  cb(Qcur, "Qcur", il);
5158
5344
 
5159
5345
  Kcur = ggml_rope_custom(
5160
5346
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5161
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5347
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5162
5348
  ext_factor, attn_factor, beta_fast, beta_slow
5163
5349
  );
5164
5350
  cb(Kcur, "Kcur", il);
@@ -5299,11 +5485,6 @@ struct llm_build_context {
5299
5485
  struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5300
5486
  cb(KQ_pos, "KQ_pos", -1);
5301
5487
 
5302
- // shift the entire K-cache if needed
5303
- if (do_rope_shift) {
5304
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5305
- }
5306
-
5307
5488
  for (int il = 0; il < n_layer; ++il) {
5308
5489
  struct ggml_tensor * inpSA = inpL;
5309
5490
 
@@ -5327,12 +5508,12 @@ struct llm_build_context {
5327
5508
  case MODEL_7B:
5328
5509
  Qcur = ggml_rope_custom(
5329
5510
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5330
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5511
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5331
5512
  ext_factor, attn_factor, beta_fast, beta_slow
5332
5513
  );
5333
5514
  Kcur = ggml_rope_custom(
5334
5515
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5335
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5516
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5336
5517
  ext_factor, attn_factor, beta_fast, beta_slow
5337
5518
  );
5338
5519
  break;
@@ -5417,11 +5598,6 @@ struct llm_build_context {
5417
5598
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5418
5599
  cb(KQ_mask, "KQ_mask", -1);
5419
5600
 
5420
- // shift the entire K-cache if needed
5421
- if (do_rope_shift) {
5422
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5423
- }
5424
-
5425
5601
  for (int il = 0; il < n_layer; ++il) {
5426
5602
  struct ggml_tensor * attn_norm;
5427
5603
 
@@ -5460,13 +5636,13 @@ struct llm_build_context {
5460
5636
 
5461
5637
  // using mode = 2 for neox mode
5462
5638
  Qcur = ggml_rope_custom(
5463
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5639
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5464
5640
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5465
5641
  );
5466
5642
  cb(Qcur, "Qcur", il);
5467
5643
 
5468
5644
  Kcur = ggml_rope_custom(
5469
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5645
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5470
5646
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5471
5647
  );
5472
5648
  cb(Kcur, "Kcur", il);
@@ -5636,10 +5812,6 @@ struct llm_build_context {
5636
5812
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5637
5813
  cb(KQ_mask, "KQ_mask", -1);
5638
5814
 
5639
- if (do_rope_shift) {
5640
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5641
- }
5642
-
5643
5815
  for (int il = 0; il < n_layer; ++il) {
5644
5816
  struct ggml_tensor * residual = inpL;
5645
5817
 
@@ -5697,7 +5869,7 @@ struct llm_build_context {
5697
5869
 
5698
5870
  // RoPE the first n_rot of q/k, pass the other half, and concat.
5699
5871
  struct ggml_tensor * qrot = ggml_view_3d(
5700
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5872
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5701
5873
  ggml_element_size(tmpq) * n_embd_head,
5702
5874
  ggml_element_size(tmpq) * n_embd_head * n_head,
5703
5875
  0
@@ -5705,7 +5877,7 @@ struct llm_build_context {
5705
5877
  cb(qrot, "qrot", il);
5706
5878
 
5707
5879
  struct ggml_tensor * krot = ggml_view_3d(
5708
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5880
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5709
5881
  ggml_element_size(tmpk) * n_embd_head,
5710
5882
  ggml_element_size(tmpk) * n_embd_head * n_head,
5711
5883
  0
@@ -5714,29 +5886,29 @@ struct llm_build_context {
5714
5886
 
5715
5887
  // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
5716
5888
  struct ggml_tensor * qpass = ggml_view_3d(
5717
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5889
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5718
5890
  ggml_element_size(tmpq) * n_embd_head,
5719
5891
  ggml_element_size(tmpq) * n_embd_head * n_head,
5720
- ggml_element_size(tmpq) * hparams.n_rot
5892
+ ggml_element_size(tmpq) * n_rot
5721
5893
  );
5722
5894
  cb(qpass, "qpass", il);
5723
5895
 
5724
5896
  struct ggml_tensor * kpass = ggml_view_3d(
5725
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5897
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5726
5898
  ggml_element_size(tmpk) * n_embd_head,
5727
5899
  ggml_element_size(tmpk) * n_embd_head * n_head,
5728
- ggml_element_size(tmpk) * hparams.n_rot
5900
+ ggml_element_size(tmpk) * n_rot
5729
5901
  );
5730
5902
  cb(kpass, "kpass", il);
5731
5903
 
5732
5904
  struct ggml_tensor * qrotated = ggml_rope_custom(
5733
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5905
+ ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5734
5906
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5735
5907
  );
5736
5908
  cb(qrotated, "qrotated", il);
5737
5909
 
5738
5910
  struct ggml_tensor * krotated = ggml_rope_custom(
5739
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5911
+ ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5740
5912
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5741
5913
  );
5742
5914
  cb(krotated, "krotated", il);
@@ -5921,6 +6093,7 @@ struct llm_build_context {
5921
6093
 
5922
6094
  const int64_t n_embd_head = hparams.n_embd_head_v;
5923
6095
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6096
+
5924
6097
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5925
6098
 
5926
6099
  struct ggml_tensor * cur;
@@ -5928,9 +6101,10 @@ struct llm_build_context {
5928
6101
 
5929
6102
  // get input vectors with right size
5930
6103
  const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5931
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6104
+
6105
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5932
6106
  struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5933
- struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
6107
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5934
6108
 
5935
6109
  // construct input embeddings (token, type, position)
5936
6110
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5948,39 +6122,38 @@ struct llm_build_context {
5948
6122
  cb(inpL, "inp_norm", -1);
5949
6123
 
5950
6124
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5951
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5952
- cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
6125
+ struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
6126
+ cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
5953
6127
 
5954
6128
  // iterate layers
5955
6129
  for (int il = 0; il < n_layer; ++il) {
5956
6130
  struct ggml_tensor * cur = inpL;
5957
6131
 
6132
+ struct ggml_tensor * Qcur;
6133
+ struct ggml_tensor * Kcur;
6134
+ struct ggml_tensor * Vcur;
6135
+
5958
6136
  // self-attention
5959
6137
  if (model.arch == LLM_ARCH_BERT) {
5960
- struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6138
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5961
6139
  cb(Qcur, "Qcur", il);
5962
6140
 
5963
- struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6141
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
5964
6142
  cb(Kcur, "Kcur", il);
5965
6143
 
5966
- struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6144
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
5967
6145
  cb(Vcur, "Vcur", il);
5968
6146
 
5969
- // seems like we just need to do this for Q?
5970
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5971
-
5972
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5973
- model.layers[il].wo, model.layers[il].bo,
5974
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5975
- cb(cur, "kqv_out", il);
6147
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6148
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5976
6149
  } else {
5977
6150
  // compute Q and K and RoPE them
5978
6151
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5979
6152
  cb(cur, "wqkv", il);
5980
6153
 
5981
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5982
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5983
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6154
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6155
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6156
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5984
6157
 
5985
6158
  cb(Qcur, "Qcur", il);
5986
6159
  cb(Kcur, "Kcur", il);
@@ -5988,24 +6161,52 @@ struct llm_build_context {
5988
6161
 
5989
6162
  Qcur = ggml_rope_custom(
5990
6163
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5991
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6164
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5992
6165
  ext_factor, attn_factor, beta_fast, beta_slow
5993
6166
  );
5994
6167
  cb(Qcur, "Qcur", il);
5995
6168
 
5996
6169
  Kcur = ggml_rope_custom(
5997
6170
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5998
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6171
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5999
6172
  ext_factor, attn_factor, beta_fast, beta_slow
6000
6173
  );
6001
6174
  cb(Kcur, "Kcur", il);
6175
+ }
6002
6176
 
6003
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6004
- model.layers[il].wo, model.layers[il].bo,
6005
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6006
- cb(cur, "kqv_out", il);
6177
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
6178
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
6179
+
6180
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
6181
+ cb(kq, "kq", il);
6182
+
6183
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
6184
+ cb(kq, "kq_soft_max_ext", il);
6185
+
6186
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
6187
+ cb(v, "v", il);
6188
+
6189
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
6190
+ cb(kqv, "kqv", il);
6191
+
6192
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
6193
+ cb(kqv_merged, "kqv_merged", il);
6194
+
6195
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
6196
+ cb(cur, "kqv_merged_cont", il);
6197
+
6198
+ ggml_build_forward_expand(gf, cur);
6199
+
6200
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
6201
+ if (model.layers[il].bo) {
6202
+ cb(cur, "kqv_wo", il);
6007
6203
  }
6008
6204
 
6205
+ if (model.layers[il].bo) {
6206
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
6207
+ }
6208
+ cb(cur, "kqv_out", il);
6209
+
6009
6210
  // re-add the layer input
6010
6211
  cur = ggml_add(ctx0, cur, inpL);
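The hunk above drops the llm_build_kv helper for this path and spells the attention out node by node: Q and K are permuted, kq = K*Q is soft-maxed together with KQ_mask and the 1/sqrt(n_embd_head) scale (ggml_soft_max_ext), the transposed V is multiplied in, and the heads are merged before the wo projection. A minimal standalone sketch of that masked, scaled dot-product attention for one head, using plain std::vector instead of ggml tensors; the function and variable names are illustrative only, not llama.cpp identifiers.

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    // One attention head: q, k, v are row-major [rows x d]; mask[j*n_kv + i] is
    // 0.0f (visible) or -INFINITY (masked), mirroring the KQ_mask view above.
    static std::vector<float> attn_head(const std::vector<float> & q,
                                        const std::vector<float> & k,
                                        const std::vector<float> & v,
                                        const std::vector<float> & mask,
                                        int n_tokens, int n_kv, int d) {
        const float scale = 1.0f/std::sqrt(float(d));
        std::vector<float> out(n_tokens*d, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            // scores = (K * q_j) * scale + mask, then a numerically stable softmax
            std::vector<float> s(n_kv);
            float smax = -std::numeric_limits<float>::infinity();
            for (int i = 0; i < n_kv; ++i) {
                float dot = 0.0f;
                for (int c = 0; c < d; ++c) dot += k[i*d + c]*q[j*d + c];
                s[i] = dot*scale + mask[j*n_kv + i];
                smax = std::max(smax, s[i]);
            }
            float sum = 0.0f;
            for (int i = 0; i < n_kv; ++i) { s[i] = std::exp(s[i] - smax); sum += s[i]; }
            // out_j = softmax(scores) * V   (the kqv step)
            for (int i = 0; i < n_kv; ++i) {
                const float w = s[i]/sum;
                for (int c = 0; c < d; ++c) out[j*d + c] += w*v[i*d + c];
            }
        }
        return out;
    }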
6011
6212
 
@@ -6045,16 +6246,29 @@ struct llm_build_context {
6045
6246
 
6046
6247
  // final output
6047
6248
  cur = inpL;
6249
+ cb(cur, "result_embd", -1);
6048
6250
 
6049
6251
  // pooling layer
6050
- if (pooling_type == LLAMA_POOLING_MEAN) {
6051
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6052
- } else if (pooling_type == LLAMA_POOLING_CLS) {
6053
- cur = ggml_get_rows(ctx0, cur, inp_cls);
6054
- } else {
6055
- GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
6252
+ switch (pooling_type) {
6253
+ case LLAMA_POOLING_TYPE_NONE:
6254
+ {
6255
+ // nop
6256
+ } break;
6257
+ case LLAMA_POOLING_TYPE_MEAN:
6258
+ {
6259
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6260
+ cb(cur, "result_embd_pooled", -1);
6261
+ } break;
6262
+ case LLAMA_POOLING_TYPE_CLS:
6263
+ {
6264
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6265
+ cb(cur, "result_embd_pooled", -1);
6266
+ } break;
6267
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
6268
+ {
6269
+ GGML_ASSERT(false && "Invalid pooling type");
6270
+ } break;
6056
6271
  }
6057
- cb(cur, "result_embd", -1);
6058
6272
 
6059
6273
  ggml_build_forward_expand(gf, cur);
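For reference, the new pooling switch above reduces per-token embeddings to one vector per sequence: LLAMA_POOLING_TYPE_MEAN multiplies by the precomputed inp_mean matrix (a per-sequence average over its tokens) and LLAMA_POOLING_TYPE_CLS gathers the row of each sequence's first token via inp_cls. A rough single-sequence sketch of the same arithmetic, assuming a row-major [n_tokens x n_embd] buffer; the helper names below are made up for illustration.

    #include <vector>

    // Mean pooling: average the token embeddings of one sequence.
    static std::vector<float> pool_mean(const std::vector<float> & embd, int n_tokens, int n_embd) {
        std::vector<float> out(n_embd, 0.0f);
        for (int t = 0; t < n_tokens; ++t) {
            for (int c = 0; c < n_embd; ++c) out[c] += embd[t*n_embd + c];
        }
        for (int c = 0; c < n_embd; ++c) out[c] /= float(n_tokens);
        return out;
    }

    // CLS pooling: keep only the row of the sequence's first token
    // (what ggml_get_rows does with the index stored in inp_cls).
    static std::vector<float> pool_cls(const std::vector<float> & embd, int n_embd, int cls_row) {
        return std::vector<float>(embd.begin() + cls_row*n_embd,
                                  embd.begin() + (cls_row + 1)*n_embd);
    }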
6060
6274
 
@@ -6284,11 +6498,6 @@ struct llm_build_context {
6284
6498
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6285
6499
  cb(KQ_mask, "KQ_mask", -1);
6286
6500
 
6287
- // shift the entire K-cache if needed
6288
- if (do_rope_shift) {
6289
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6290
- }
6291
-
6292
6501
  for (int il = 0; il < n_layer; ++il) {
6293
6502
  struct ggml_tensor * inpSA = inpL;
6294
6503
 
@@ -6325,14 +6534,14 @@ struct llm_build_context {
6325
6534
 
6326
6535
  Qcur = ggml_rope_custom(
6327
6536
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6328
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6537
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6329
6538
  ext_factor, attn_factor, beta_fast, beta_slow
6330
6539
  );
6331
6540
  cb(Qcur, "Qcur", il);
6332
6541
 
6333
6542
  Kcur = ggml_rope_custom(
6334
6543
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6335
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6544
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6336
6545
  ext_factor, attn_factor, beta_fast, beta_slow
6337
6546
  );
6338
6547
  cb(Kcur, "Kcur", il);
@@ -6407,11 +6616,6 @@ struct llm_build_context {
6407
6616
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6408
6617
  cb(KQ_mask, "KQ_mask", -1);
6409
6618
 
6410
- // shift the entire K-cache if needed
6411
- if (do_rope_shift) {
6412
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6413
- }
6414
-
6415
6619
  for (int il = 0; il < n_layer; ++il) {
6416
6620
  struct ggml_tensor * inpSA = inpL;
6417
6621
 
@@ -6441,13 +6645,13 @@ struct llm_build_context {
6441
6645
 
6442
6646
  // using mode = 2 for neox mode
6443
6647
  Qcur = ggml_rope_custom(
6444
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6648
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6445
6649
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6446
6650
  );
6447
6651
  cb(Qcur, "Qcur", il);
6448
6652
 
6449
6653
  Kcur = ggml_rope_custom(
6450
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6654
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6451
6655
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6452
6656
  );
6453
6657
  cb(Kcur, "Kcur", il);
@@ -6521,11 +6725,6 @@ struct llm_build_context {
6521
6725
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6522
6726
  cb(KQ_mask, "KQ_mask", -1);
6523
6727
 
6524
- // shift the entire K-cache if needed
6525
- if (do_rope_shift) {
6526
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6527
- }
6528
-
6529
6728
  for (int il = 0; il < n_layer; ++il) {
6530
6729
  struct ggml_tensor * inpSA = inpL;
6531
6730
 
@@ -6561,14 +6760,14 @@ struct llm_build_context {
6561
6760
 
6562
6761
  Qcur = ggml_rope_custom(
6563
6762
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6564
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6763
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6565
6764
  ext_factor, attn_factor, beta_fast, beta_slow
6566
6765
  );
6567
6766
  cb(Qcur, "Qcur", il);
6568
6767
 
6569
6768
  Kcur = ggml_rope_custom(
6570
6769
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6571
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6770
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6572
6771
  ext_factor, attn_factor, beta_fast, beta_slow
6573
6772
  );
6574
6773
  cb(Kcur, "Kcur", il);
@@ -6642,11 +6841,6 @@ struct llm_build_context {
6642
6841
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6643
6842
  cb(KQ_mask, "KQ_mask", -1);
6644
6843
 
6645
- // shift the entire K-cache if needed
6646
- if (do_rope_shift) {
6647
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6648
- }
6649
-
6650
6844
  for (int il = 0; il < n_layer; ++il) {
6651
6845
  attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
6652
6846
  model.layers[il].attn_norm,
@@ -6684,7 +6878,7 @@ struct llm_build_context {
6684
6878
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6685
6879
 
6686
6880
  Qcur = ggml_rope_custom(
6687
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6881
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6688
6882
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6689
6883
  );
6690
6884
  cb(Qcur, "Qcur", il);
@@ -6695,7 +6889,7 @@ struct llm_build_context {
6695
6889
  cb(Qcur, "Qcur", il);
6696
6890
 
6697
6891
  Kcur = ggml_rope_custom(
6698
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6892
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6699
6893
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6700
6894
  );
6701
6895
  cb(Kcur, "Kcur", il);
@@ -6764,11 +6958,6 @@ struct llm_build_context {
6764
6958
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6765
6959
  cb(KQ_mask, "KQ_mask", -1);
6766
6960
 
6767
- // shift the entire K-cache if needed
6768
- if (do_rope_shift) {
6769
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6770
- }
6771
-
6772
6961
  for (int il = 0; il < n_layer; ++il) {
6773
6962
 
6774
6963
  // norm
@@ -6792,14 +6981,14 @@ struct llm_build_context {
6792
6981
  cb(Vcur, "Vcur", il);
6793
6982
 
6794
6983
  Qcur = ggml_rope_custom(
6795
- ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
6796
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6984
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
6985
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6797
6986
  ext_factor, attn_factor, beta_fast, beta_slow);
6798
6987
  cb(Qcur, "Qcur", il);
6799
6988
 
6800
6989
  Kcur = ggml_rope_custom(
6801
- ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
6802
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6990
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
6991
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6803
6992
  ext_factor, attn_factor, beta_fast, beta_slow);
6804
6993
  cb(Kcur, "Kcur", il);
6805
6994
 
@@ -6969,11 +7158,6 @@ struct llm_build_context {
6969
7158
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6970
7159
  cb(KQ_mask, "KQ_mask", -1);
6971
7160
 
6972
- // shift the entire K-cache if needed
6973
- if (do_rope_shift) {
6974
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6975
- }
6976
-
6977
7161
  for (int il = 0; il < n_layer; ++il) {
6978
7162
  cur = llm_build_norm(ctx0, inpL, hparams,
6979
7163
  model.layers[il].attn_norm,
@@ -6999,14 +7183,14 @@ struct llm_build_context {
6999
7183
 
7000
7184
  struct ggml_tensor * Qcur = ggml_rope_custom(
7001
7185
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
7002
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7186
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7003
7187
  ext_factor, attn_factor, beta_fast, beta_slow
7004
7188
  );
7005
7189
  cb(Qcur, "Qcur", il);
7006
7190
 
7007
7191
  struct ggml_tensor * Kcur = ggml_rope_custom(
7008
7192
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
7009
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7193
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7010
7194
  ext_factor, attn_factor, beta_fast, beta_slow
7011
7195
  );
7012
7196
  cb(Kcur, "Kcur", il);
@@ -7077,11 +7261,6 @@ struct llm_build_context {
7077
7261
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7078
7262
  cb(KQ_mask, "KQ_mask", -1);
7079
7263
 
7080
- // shift the entire K-cache if needed
7081
- if (do_rope_shift) {
7082
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7083
- }
7084
-
7085
7264
  for (int il = 0; il < n_layer; ++il) {
7086
7265
  struct ggml_tensor * inpSA = inpL;
7087
7266
 
@@ -7117,14 +7296,14 @@ struct llm_build_context {
7117
7296
 
7118
7297
  Qcur = ggml_rope_custom(
7119
7298
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7120
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7299
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7121
7300
  ext_factor, attn_factor, beta_fast, beta_slow
7122
7301
  );
7123
7302
  cb(Qcur, "Qcur", il);
7124
7303
 
7125
7304
  Kcur = ggml_rope_custom(
7126
7305
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7127
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7306
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7128
7307
  ext_factor, attn_factor, beta_fast, beta_slow
7129
7308
  );
7130
7309
  cb(Kcur, "Kcur", il);
@@ -7196,11 +7375,6 @@ struct llm_build_context {
7196
7375
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7197
7376
  cb(KQ_mask, "KQ_mask", -1);
7198
7377
 
7199
- // shift the entire K-cache if needed
7200
- if (do_rope_shift) {
7201
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7202
- }
7203
-
7204
7378
  for (int il = 0; il < n_layer; ++il) {
7205
7379
  struct ggml_tensor * inpSA = inpL;
7206
7380
 
@@ -7236,14 +7410,14 @@ struct llm_build_context {
7236
7410
 
7237
7411
  Qcur = ggml_rope_custom(
7238
7412
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7239
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7413
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7240
7414
  ext_factor, attn_factor, beta_fast, beta_slow
7241
7415
  );
7242
7416
  cb(Qcur, "Qcur", il);
7243
7417
 
7244
7418
  Kcur = ggml_rope_custom(
7245
7419
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7246
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7420
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7421
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7422
  );
7249
7423
  cb(Kcur, "Kcur", il);
@@ -7328,11 +7502,6 @@ struct llm_build_context {
7328
7502
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7329
7503
  cb(KQ_mask, "KQ_mask", -1);
7330
7504
 
7331
- // shift the entire K-cache if needed
7332
- if (do_rope_shift) {
7333
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7334
- }
7335
-
7336
7505
  for (int il = 0; il < n_layer; ++il) {
7337
7506
  struct ggml_tensor * inpSA = inpL;
7338
7507
 
@@ -7368,14 +7537,14 @@ struct llm_build_context {
7368
7537
 
7369
7538
  Qcur = ggml_rope_custom(
7370
7539
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7371
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7540
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7372
7541
  ext_factor, attn_factor, beta_fast, beta_slow
7373
7542
  );
7374
7543
  cb(Qcur, "Qcur", il);
7375
7544
 
7376
7545
  Kcur = ggml_rope_custom(
7377
7546
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7378
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7547
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7379
7548
  ext_factor, attn_factor, beta_fast, beta_slow
7380
7549
  );
7381
7550
  cb(Kcur, "Kcur", il);
@@ -7464,11 +7633,6 @@ struct llm_build_context {
7464
7633
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7465
7634
  cb(KQ_mask, "KQ_mask", -1);
7466
7635
 
7467
- // shift the entire K-cache if needed
7468
- if (do_rope_shift) {
7469
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7470
- }
7471
-
7472
7636
  for (int il = 0; il < n_layer; ++il) {
7473
7637
 
7474
7638
  // norm
@@ -7491,7 +7655,7 @@ struct llm_build_context {
7491
7655
 
7492
7656
  Qcur = ggml_rope_custom(
7493
7657
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
7494
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7658
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7495
7659
  ext_factor, attn_factor, beta_fast, beta_slow);
7496
7660
  cb(Qcur, "Qcur", il);
7497
7661
 
@@ -7500,7 +7664,7 @@ struct llm_build_context {
7500
7664
 
7501
7665
  Kcur = ggml_rope_custom(
7502
7666
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
7503
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7667
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7504
7668
  ext_factor, attn_factor, beta_fast, beta_slow);
7505
7669
  cb(Kcur, "Kcur", il);
7506
7670
 
@@ -7551,33 +7715,181 @@ struct llm_build_context {
7551
7715
 
7552
7716
  return gf;
7553
7717
  }
7554
- };
7555
-
7556
- static struct ggml_cgraph * llama_build_graph(
7557
- llama_context & lctx,
7558
- const llama_batch & batch,
7559
- bool worst_case) {
7560
- const auto & model = lctx.model;
7561
7718
 
7562
- // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7563
- llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7564
- if (il >= 0) {
7565
- ggml_format_name(cur, "%s-%d", name, il);
7566
- } else {
7567
- ggml_set_name(cur, name);
7568
- }
7719
+ struct ggml_cgraph * build_starcoder2() {
7720
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7569
7721
 
7570
- if (!lctx.cparams.offload_kqv) {
7571
- if (strcmp(name, "kqv_merged_cont") == 0) {
7572
- // all nodes between the KV store and the attention output are run on the CPU
7573
- ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
7574
- }
7575
- }
7576
- };
7722
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7723
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7724
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7577
7725
 
7578
- struct ggml_cgraph * result = NULL;
7726
+ struct ggml_tensor * cur;
7727
+ struct ggml_tensor * inpL;
7579
7728
 
7580
- struct llm_build_context llm(lctx, batch, cb, worst_case);
7729
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7730
+ cb(inpL, "inp_embd", -1);
7731
+
7732
+ // inp_pos - contains the positions
7733
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7734
+ cb(inp_pos, "inp_pos", -1);
7735
+
7736
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7737
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7738
+ cb(KQ_mask, "KQ_mask", -1);
7739
+
7740
+ for (int il = 0; il < n_layer; ++il) {
7741
+ struct ggml_tensor * inpSA = inpL;
7742
+
7743
+ // norm
7744
+ cur = llm_build_norm(ctx0, inpL, hparams,
7745
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
7746
+ LLM_NORM, cb, il);
7747
+ cb(cur, "attn_norm", il);
7748
+
7749
+ // self-attention
7750
+ {
7751
+ // compute Q and K and RoPE them
7752
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
7753
+ cb(Qcur, "Qcur", il);
7754
+ if (model.layers[il].bq) {
7755
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
7756
+ cb(Qcur, "Qcur", il);
7757
+ }
7758
+
7759
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
7760
+ cb(Kcur, "Kcur", il);
7761
+ if (model.layers[il].bk) {
7762
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
7763
+ cb(Kcur, "Kcur", il);
7764
+ }
7765
+
7766
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7767
+ cb(Vcur, "Vcur", il);
7768
+ if (model.layers[il].bv) {
7769
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
7770
+ cb(Vcur, "Vcur", il);
7771
+ }
7772
+
7773
+ Qcur = ggml_rope_custom(
7774
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7775
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7776
+ ext_factor, attn_factor, beta_fast, beta_slow
7777
+ );
7778
+ cb(Qcur, "Qcur", il);
7779
+
7780
+ Kcur = ggml_rope_custom(
7781
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7782
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7783
+ ext_factor, attn_factor, beta_fast, beta_slow
7784
+ );
7785
+ cb(Kcur, "Kcur", il);
7786
+
7787
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7788
+ model.layers[il].wo, model.layers[il].bo,
7789
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7790
+ cb(cur, "kqv_out", il);
7791
+ }
7792
+
7793
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7794
+ cb(ffn_inp, "ffn_inp", il);
7795
+
7796
+ // feed-forward network
7797
+
7798
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7799
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
7800
+ LLM_NORM, cb, il);
7801
+ cb(cur, "ffn_norm", il);
7802
+
7803
+ cur = llm_build_ffn(ctx0, cur,
7804
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7805
+ NULL, NULL,
7806
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7807
+ NULL,
7808
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
7809
+ cb(cur, "ffn_out", il);
7810
+ cur = ggml_add(ctx0, cur, ffn_inp);
7811
+ cb(cur, "l_out", il);
7812
+
7813
+ // input for next layer
7814
+ inpL = cur;
7815
+ }
7816
+
7817
+ cur = inpL;
7818
+
7819
+ cur = llm_build_norm(ctx0, cur, hparams,
7820
+ model.output_norm, model.output_norm_b,
7821
+ LLM_NORM, cb, -1);
7822
+ cb(cur, "result_norm", -1);
7823
+
7824
+ // lm_head
7825
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7826
+ cb(cur, "result_output", -1);
7827
+
7828
+ ggml_build_forward_expand(gf, cur);
7829
+
7830
+ return gf;
7831
+ }
7832
+ };
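The new build_starcoder2 graph follows the familiar pre-norm decoder layout (LayerNorm with bias, biased Q/K/V projections, RoPE, attention, residual, then LayerNorm and FFN with a second residual), but its feed-forward is the sequential GELU variant (LLM_FFN_GELU with LLM_FFN_SEQ) rather than the gated SiLU path used elsewhere in the file for llama-style models. A rough sketch of that FFN applied to one embedding vector; the weight names are placeholders, not llama.cpp identifiers.

    #include <cmath>
    #include <vector>

    // tanh approximation of GELU, as commonly used for this activation
    static float gelu(float x) {
        return 0.5f*x*(1.0f + std::tanh(0.7978845608f*(x + 0.044715f*x*x*x)));
    }

    // x -> W_up*x + b_up -> GELU -> W_down*(...) + b_down, with no gate tensor.
    // Matrices are row-major: W_up is [n_ff x n_embd], W_down is [n_embd x n_ff].
    static std::vector<float> ffn_gelu_seq(const std::vector<float> & x,
                                           const std::vector<float> & W_up,   const std::vector<float> & b_up,
                                           const std::vector<float> & W_down, const std::vector<float> & b_down,
                                           int n_embd, int n_ff) {
        std::vector<float> h(n_ff), y(n_embd);
        for (int i = 0; i < n_ff; ++i) {
            float acc = b_up[i];
            for (int j = 0; j < n_embd; ++j) acc += W_up[i*n_embd + j]*x[j];
            h[i] = gelu(acc);
        }
        for (int i = 0; i < n_embd; ++i) {
            float acc = b_down[i];
            for (int j = 0; j < n_ff; ++j) acc += W_down[i*n_ff + j]*h[j];
            y[i] = acc;
        }
        return y;
    }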
7833
+
7834
+ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
7835
+ llama_batch dummy;
7836
+ dummy.n_tokens = 0;
7837
+
7838
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7839
+
7840
+ struct llm_build_context llm(lctx, dummy, cb, false);
7841
+
7842
+ llm.init();
7843
+
7844
+ struct ggml_cgraph * result = llm.build_defrag(ids);
7845
+
7846
+ llm.free();
7847
+
7848
+ return result;
7849
+ }
7850
+
7851
+ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
7852
+ llama_batch dummy;
7853
+ dummy.n_tokens = 0;
7854
+
7855
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7856
+
7857
+ struct llm_build_context llm(lctx, dummy, cb, false);
7858
+
7859
+ llm.init();
7860
+
7861
+ struct ggml_cgraph * result = llm.build_k_shift();
7862
+
7863
+ llm.free();
7864
+
7865
+ return result;
7866
+ }
7867
+
7868
+ static struct ggml_cgraph * llama_build_graph(
7869
+ llama_context & lctx,
7870
+ const llama_batch & batch,
7871
+ bool worst_case) {
7872
+ const auto & model = lctx.model;
7873
+
7874
+ // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7875
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7876
+ if (il >= 0) {
7877
+ ggml_format_name(cur, "%s-%d", name, il);
7878
+ } else {
7879
+ ggml_set_name(cur, name);
7880
+ }
7881
+
7882
+ if (!lctx.cparams.offload_kqv) {
7883
+ if (strcmp(name, "kqv_merged_cont") == 0) {
7884
+ // all nodes between the KV store and the attention output are run on the CPU
7885
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
7886
+ }
7887
+ }
7888
+ };
7889
+
7890
+ struct ggml_cgraph * result = NULL;
7891
+
7892
+ struct llm_build_context llm(lctx, batch, cb, worst_case);
7581
7893
 
7582
7894
  llm.init();
7583
7895
 
@@ -7663,6 +7975,10 @@ static struct ggml_cgraph * llama_build_graph(
7663
7975
  {
7664
7976
  result = llm.build_gemma();
7665
7977
  } break;
7978
+ case LLM_ARCH_STARCODER2:
7979
+ {
7980
+ result = llm.build_starcoder2();
7981
+ } break;
7666
7982
  default:
7667
7983
  GGML_ASSERT(false);
7668
7984
  }
@@ -7672,6 +7988,20 @@ static struct ggml_cgraph * llama_build_graph(
7672
7988
  return result;
7673
7989
  }
7674
7990
 
7991
+ static void llama_set_k_shift(llama_context & lctx) {
7992
+ const auto & cparams = lctx.cparams;
7993
+
7994
+ const int64_t n_ctx = cparams.n_ctx;
7995
+
7996
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7997
+
7998
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7999
+
8000
+ for (int i = 0; i < n_ctx; ++i) {
8001
+ data[i] = lctx.kv_self.cells[i].delta;
8002
+ }
8003
+ }
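llama_set_k_shift above copies each cache cell's accumulated position delta into the inp_K_shift tensor; the K-shift graph built further down then re-rotates the cached keys by those deltas, after which the deltas are cleared. A small sketch of what the input buffer ends up holding, with a simplified stand-in for the cell struct (type and field names here are illustrative).

    #include <cstdint>
    #include <vector>

    struct kv_cell_lite { int32_t pos = -1; int32_t delta = 0; };  // simplified stand-in

    // Mirror of the loop above: one delta per context slot, 0 if the cell was never shifted.
    static std::vector<int32_t> make_k_shift(const std::vector<kv_cell_lite> & cells, uint32_t n_ctx) {
        std::vector<int32_t> data(n_ctx, 0);
        for (uint32_t i = 0; i < n_ctx && i < cells.size(); ++i) {
            data[i] = cells[i].delta;
        }
        return data;
    }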
8004
+
7675
8005
  static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7676
8006
  //
7677
8007
  // set input data
@@ -7700,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7700
8030
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7701
8031
  }
7702
8032
 
7703
- {
8033
+ if (hparams.causal_attn) {
7704
8034
  const int64_t n_kv = kv_self.n;
7705
8035
  const int64_t n_tokens = batch.n_tokens;
7706
8036
 
@@ -7715,16 +8045,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7715
8045
 
7716
8046
  for (int i = 0; i < n_kv; ++i) {
7717
8047
  float f;
7718
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7719
- (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
8048
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
7720
8049
  f = -INFINITY;
7721
8050
  } else {
7722
- f = 0;
8051
+ f = 0.0f;
7723
8052
  }
7724
8053
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7725
8054
  }
7726
8055
  }
7727
8056
  }
8057
+ } else {
8058
+ // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
8059
+ const int64_t n_tokens = batch.n_tokens;
8060
+
8061
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
8062
+
8063
+ float * data = (float *) lctx.inp_KQ_mask->data;
8064
+
8065
+ for (int h = 0; h < 1; ++h) {
8066
+ for (int j = 0; j < n_tokens; ++j) {
8067
+ const llama_seq_id seq_id = batch.seq_id[j][0];
8068
+
8069
+ for (int i = 0; i < n_tokens; ++i) {
8070
+ float f = -INFINITY;
8071
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
8072
+ if (batch.seq_id[i][s] == seq_id) {
8073
+ f = 0.0f;
8074
+ break;
8075
+ }
8076
+ }
8077
+
8078
+ data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
8079
+ }
8080
+ }
8081
+ }
7728
8082
  }
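The new else branch covers encoder-style (non-causal) models: a token may attend to every token of the same sequence inside the current batch, position ordering is ignored, and the KV cache is bypassed. A condensed sketch of that mask for batches where each token carries a single sequence id (the real loop also walks batch.n_seq_id); the helper name is invented.

    #include <cmath>
    #include <vector>

    // mask[j*n + i] is 0.0f when token j may attend token i, -INFINITY otherwise.
    static std::vector<float> build_noncausal_mask(const std::vector<int> & seq_id) {
        const int n = (int) seq_id.size();
        std::vector<float> mask(n*n, -INFINITY);
        for (int j = 0; j < n; ++j) {
            for (int i = 0; i < n; ++i) {
                if (seq_id[i] == seq_id[j]) {
                    mask[j*n + i] = 0.0f;   // same sequence -> visible
                }
            }
        }
        return mask;
    }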
7729
8083
 
7730
8084
  if (hparams.need_kq_pos) {
@@ -7739,29 +8093,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7739
8093
  }
7740
8094
  }
7741
8095
 
7742
- if (kv_self.has_shift) {
7743
- const int64_t n_ctx = cparams.n_ctx;
7744
-
7745
- assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7746
-
7747
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7748
-
7749
- for (int i = 0; i < n_ctx; ++i) {
7750
- data[i] = lctx.kv_self.cells[i].delta;
7751
- }
7752
- }
7753
-
7754
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
8096
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7755
8097
  const int64_t n_tokens = batch.n_tokens;
7756
8098
 
7757
8099
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7758
- float * data = (float *) lctx.inp_mean->data;
7759
8100
 
8101
+ float * data = (float *) lctx.inp_mean->data;
7760
8102
  memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7761
8103
 
7762
8104
  std::vector<uint64_t> sum(n_tokens, 0);
7763
8105
  for (int i = 0; i < n_tokens; ++i) {
7764
8106
  const llama_seq_id seq_id = batch.seq_id[i][0];
8107
+
8108
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
8109
+
7765
8110
  sum[seq_id] += 1;
7766
8111
  }
7767
8112
 
@@ -7779,15 +8124,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7779
8124
  }
7780
8125
  }
7781
8126
 
7782
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
8127
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7783
8128
  const int64_t n_tokens = batch.n_tokens;
7784
8129
 
7785
8130
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
8131
+
7786
8132
  uint32_t * data = (uint32_t *) lctx.inp_cls->data;
8133
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
7787
8134
 
7788
8135
  for (int i = 0; i < n_tokens; ++i) {
7789
8136
  const llama_seq_id seq_id = batch.seq_id[i][0];
7790
- const llama_pos pos = batch.pos[i];
8137
+ const llama_pos pos = batch.pos[i];
8138
+
8139
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
8140
+
7791
8141
  if (pos == 0) {
7792
8142
  data[seq_id] = i;
7793
8143
  }
@@ -7795,6 +8145,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7795
8145
  }
7796
8146
  }
7797
8147
 
8148
+ static void llama_graph_compute(
8149
+ llama_context & lctx,
8150
+ ggml_cgraph * gf,
8151
+ int n_threads) {
8152
+ #ifdef GGML_USE_MPI
8153
+ const int64_t n_layer = lctx.model.hparams.n_layer;
8154
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
8155
+ #endif
8156
+
8157
+ #ifdef GGML_USE_METAL
8158
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
8159
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
8160
+ }
8161
+ #endif
8162
+
8163
+ if (lctx.backend_cpu != nullptr) {
8164
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
8165
+ ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
8166
+ }
8167
+
8168
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
8169
+
8170
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
8171
+
8172
+ #ifdef GGML_USE_MPI
8173
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
8174
+ #endif
8175
+ }
8176
+
7798
8177
  // decode a batch of tokens by evaluating the transformer
7799
8178
  //
7800
8179
  // - lctx: llama context
@@ -7821,9 +8200,9 @@ static int llama_decode_internal(
7821
8200
  const auto n_batch = cparams.n_batch;
7822
8201
 
7823
8202
  GGML_ASSERT(n_tokens <= n_batch);
8203
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7824
8204
 
7825
8205
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
7826
- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7827
8206
 
7828
8207
  const int64_t t_start_us = ggml_time_us();
7829
8208
 
@@ -7872,21 +8251,26 @@ static int llama_decode_internal(
7872
8251
  batch.seq_id = seq_id_arr.data();
7873
8252
  }
7874
8253
 
7875
- // if we have enough unused cells before the current head ->
7876
- // better to start searching from the beginning of the cache, hoping to fill it
7877
- if (kv_self.head > kv_self.used + 2*n_tokens) {
7878
- kv_self.head = 0;
7879
- }
8254
+ // non-causal masks do not use the KV cache
8255
+ if (hparams.causal_attn) {
8256
+ llama_kv_cache_update(&lctx);
7880
8257
 
7881
- if (!llama_kv_cache_find_slot(kv_self, batch)) {
7882
- return 1;
7883
- }
8258
+ // if we have enough unused cells before the current head ->
8259
+ // better to start searching from the beginning of the cache, hoping to fill it
8260
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
8261
+ kv_self.head = 0;
8262
+ }
7884
8263
 
7885
- // a heuristic, to avoid attending the full cache if it is not yet utilized
7886
- // after enough generations, the benefit from this heuristic disappears
7887
- // if we start defragmenting the cache, the benefit from this will be more important
7888
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
7889
- //kv_self.n = llama_kv_cache_cell_max(kv_self);
8264
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
8265
+ return 1;
8266
+ }
8267
+
8268
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
8269
+ // after enough generations, the benefit from this heuristic disappears
8270
+ // if we start defragmenting the cache, the benefit from this will be more important
8271
+ kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
8272
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
8273
+ }
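In the causal branch above the cache is first brought up to date (llama_kv_cache_update handles any pending K-shift and defrag work) and kv_self.n is then clamped with a padding heuristic so attention does not scan the whole context while the cache is still mostly empty. The arithmetic, spelled out; the helper name is illustrative.

    #include <algorithm>
    #include <cstdint>

    // min(n_ctx, max(32, GGML_PAD(cell_max, 32))), with GGML_PAD rounding up to a multiple of 32
    static uint32_t kv_window(uint32_t cell_max, uint32_t n_ctx) {
        const uint32_t padded = ((cell_max + 31)/32)*32;
        return std::min(n_ctx, std::max(32u, padded));
    }
    // e.g. cell_max = 70, n_ctx = 4096  ->  kv_window = 96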
7890
8274
 
7891
8275
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
7892
8276
 
@@ -7896,19 +8280,26 @@ static int llama_decode_internal(
7896
8280
  ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7897
8281
 
7898
8282
  // the output is always the last tensor in the graph
7899
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7900
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7901
- if (strcmp(res->name, "result_output") == 0) {
7902
- // the embeddings could be the second to last tensor, or the third to last tensor
7903
- if (strcmp(embeddings->name, "result_norm") != 0) {
7904
- embeddings = gf->nodes[gf->n_nodes - 3];
7905
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7906
- }
7907
- } else if (strcmp(res->name, "result_embd") == 0) {
7908
- embeddings = res;
7909
- res = nullptr;
8283
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
8284
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
8285
+
8286
+ if (!hparams.causal_attn) {
8287
+ res = nullptr; // do not extract logits for embedding models such as BERT
8288
+
8289
+ // token or sequence embeddings
8290
+ embd = gf->nodes[gf->n_nodes - 1];
8291
+
8292
+ GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
7910
8293
  } else {
7911
- GGML_ASSERT(false);
8294
+ if (strcmp(res->name, "result_output") == 0) {
8295
+ // the token embeddings could be the second to last tensor, or the third to last tensor
8296
+ if (strcmp(embd->name, "result_norm") != 0) {
8297
+ embd = gf->nodes[gf->n_nodes - 3];
8298
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
8299
+ }
8300
+ } else {
8301
+ GGML_ASSERT(false && "missing result_output tensor");
8302
+ }
7912
8303
  }
7913
8304
 
7914
8305
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7924,40 +8315,12 @@ static int llama_decode_internal(
7924
8315
  n_threads = std::min(4, n_threads);
7925
8316
  }
7926
8317
 
7927
- #ifdef GGML_USE_MPI
7928
- const int64_t n_layer = hparams.n_layer;
7929
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7930
- #endif
7931
-
7932
- #ifdef GGML_USE_METAL
7933
- if (ggml_backend_is_metal(lctx.backend_metal)) {
7934
- ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7935
- }
7936
- #endif
7937
-
7938
- if (lctx.backend_cpu != nullptr) {
7939
- ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7940
- }
7941
-
7942
8318
  llama_set_inputs(lctx, batch);
7943
8319
 
7944
- ggml_backend_sched_graph_compute(lctx.sched, gf);
7945
-
7946
- // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7947
-
7948
- #ifdef GGML_USE_MPI
7949
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7950
- #endif
8320
+ llama_graph_compute(lctx, gf, n_threads);
7951
8321
 
7952
8322
  // update the kv ring buffer
7953
8323
  {
7954
- if (kv_self.has_shift) {
7955
- kv_self.has_shift = false;
7956
- for (uint32_t i = 0; i < kv_self.size; ++i) {
7957
- kv_self.cells[i].delta = 0;
7958
- }
7959
- }
7960
-
7961
8324
  kv_self.head += n_tokens;
7962
8325
 
7963
8326
  // Ensure kv cache head points to a valid index.
@@ -7966,6 +8329,18 @@ static int llama_decode_internal(
7966
8329
  }
7967
8330
  }
7968
8331
 
8332
+ // decide if we need to defrag the kv cache
8333
+ if (cparams.defrag_thold >= 0.0f) {
8334
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
8335
+
8336
+ // queue defragmentation for next llama_kv_cache_update
8337
+ if (fragmentation > cparams.defrag_thold) {
8338
+ //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
8339
+
8340
+ llama_kv_cache_defrag(kv_self);
8341
+ }
8342
+ }
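Fragmentation here is the share of the attended KV window that is not holding live cells, and defragmentation is only queued once the window spans at least 128 cells and the ratio exceeds defrag_thold. Spelled out as a tiny helper (name and example values are illustrative):

    #include <cstdint>

    static float kv_fragmentation(uint32_t n_window, uint32_t n_used, uint32_t n_tokens) {
        return n_window >= 128 ? 1.0f - float(n_used + n_tokens)/float(n_window) : 0.0f;
    }
    // e.g. n_window = 512, n_used = 300, n_tokens = 12  ->  ~0.39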
8343
+
7969
8344
  #ifdef GGML_PERF
7970
8345
  // print timing information per ggml operation (for debugging purposes)
7971
8346
  // requires GGML_PERF to be defined
@@ -7991,66 +8366,341 @@ static int llama_decode_internal(
7991
8366
  logits_out.clear();
7992
8367
  #endif
7993
8368
 
7994
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
7995
- GGML_ASSERT(res_backend != nullptr);
8369
+ ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
8370
+ GGML_ASSERT(backend_res != nullptr);
8371
+
7996
8372
  if (batch.logits) {
7997
8373
  logits_out.resize(n_vocab * n_tokens);
7998
8374
  for (uint32_t i = 0; i < n_tokens; i++) {
7999
8375
  if (batch.logits[i] == 0) {
8000
8376
  continue;
8001
8377
  }
8002
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8378
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8003
8379
  #ifndef NDEBUG
8004
8380
  logits_valid[i] = true;
8005
8381
  #endif
8006
8382
  }
8007
8383
  } else if (lctx.logits_all) {
8008
8384
  logits_out.resize(n_vocab * n_tokens);
8009
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8385
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8010
8386
  #ifndef NDEBUG
8011
8387
  std::fill(logits_valid.begin(), logits_valid.end(), true);
8012
8388
  #endif
8013
8389
  } else {
8014
8390
  logits_out.resize(n_vocab);
8015
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8391
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8016
8392
  #ifndef NDEBUG
8017
8393
  logits_valid[0] = true;
8018
8394
  #endif
8019
8395
  }
8020
- ggml_backend_synchronize(res_backend);
8021
- }
8396
+ ggml_backend_synchronize(backend_res);
8397
+ }
8398
+
8399
+ // extract embeddings
8400
+ if (cparams.embeddings && embd) {
8401
+ ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
8402
+ GGML_ASSERT(backend_embd != nullptr);
8403
+
8404
+ switch (cparams.pooling_type) {
8405
+ case LLAMA_POOLING_TYPE_NONE:
8406
+ {
8407
+ // extract token embeddings
8408
+ auto & embd_out = lctx.embd;
8409
+
8410
+ if (batch.logits) {
8411
+ embd_out.resize(n_embd * n_tokens);
8412
+ for (uint32_t i = 0; i < n_tokens; i++) {
8413
+ if (batch.logits[i] == 0) {
8414
+ continue;
8415
+ }
8416
+
8417
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
8418
+ }
8419
+ }
8420
+ } break;
8421
+ case LLAMA_POOLING_TYPE_CLS:
8422
+ case LLAMA_POOLING_TYPE_MEAN:
8423
+ {
8424
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
8425
+
8426
+ // extract sequence embeddings
8427
+ auto & embd_seq_out = lctx.embd_seq;
8428
+ embd_seq_out.clear();
8429
+
8430
+ for (uint32_t i = 0; i < n_tokens; i++) {
8431
+ const llama_seq_id seq_id = batch.seq_id[i][0];
8432
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
8433
+ continue;
8434
+ }
8435
+ embd_seq_out[seq_id].resize(n_embd);
8436
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
8437
+ }
8438
+ } break;
8439
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
8440
+ {
8441
+ GGML_ASSERT(false && "unknown pooling type");
8442
+ } break;
8443
+ }
8444
+ ggml_backend_synchronize(backend_embd);
8445
+ }
8446
+
8447
+ // measure the performance only for the single-token evals
8448
+ if (n_tokens == 1) {
8449
+ lctx.t_eval_us += ggml_time_us() - t_start_us;
8450
+ lctx.n_eval++;
8451
+ }
8452
+ else if (n_tokens > 1) {
8453
+ lctx.t_p_eval_us += ggml_time_us() - t_start_us;
8454
+ lctx.n_p_eval += n_tokens;
8455
+ }
8456
+
8457
+ // get a more accurate load time, upon first eval
8458
+ // TODO: fix this
8459
+ if (!lctx.has_evaluated_once) {
8460
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
8461
+ lctx.has_evaluated_once = true;
8462
+ }
8463
+
8464
+ return 0;
8465
+ }
8466
+
8467
+ // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
8468
+ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8469
+ auto & kv_self = lctx.kv_self;
8470
+
8471
+ const auto & hparams = lctx.model.hparams;
8472
+
8473
+ const uint32_t n_layer = hparams.n_layer;
8474
+
8475
+ const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
8476
+ const uint32_t n_used = kv_self.used;
8477
+
8478
+ assert(n_used <= n_kv);
8479
+
8480
+ //const int64_t t_start = ggml_time_us();
8481
+
8482
+ // number of cells moved
8483
+ uint32_t n_moves = 0;
8484
+
8485
+ // determine which KV cells to move where
8486
+ //
8487
+ // cell i moves to ids[i]
8488
+ //
8489
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
8490
+ //
8491
+ std::vector<uint32_t> ids(n_kv, n_kv);
8492
+
8493
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
8494
+ const auto & cell0 = kv_self.cells[i0];
8495
+
8496
+ if (!cell0.is_empty()) {
8497
+ ids[i0] = i0;
8498
+
8499
+ continue;
8500
+ }
8501
+
8502
+ // found a hole - fill it with data from the end of the cache
8503
+
8504
+ uint32_t nh = 1;
8505
+
8506
+ // determine the size of the hole
8507
+ while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
8508
+ nh++;
8509
+ }
8510
+
8511
+ // each move requires 6*n_layer tensors (see build_defrag)
8512
+ // - source view, destination view, copy operation
8513
+ // - x2 for keys and values
8514
+ //
8515
+ if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
8516
+ // the graph is too big, we cannot move more cells
8517
+ break;
8518
+ }
8519
+
8520
+ uint32_t nf = 0;
8521
+ uint32_t is = n_kv - 1;
8522
+
8523
+ // starting from the end, find nh non-empty cells
8524
+ for (; is > i0; --is) {
8525
+ const auto & cell1 = kv_self.cells[is];
8526
+
8527
+ if (cell1.is_empty() || ids[is] != n_kv) {
8528
+ continue;
8529
+ }
8530
+
8531
+ // non-empty cell which is not yet moved
8532
+ nf++;
8533
+
8534
+ if (nf == nh) {
8535
+ break;
8536
+ }
8537
+ }
8538
+
8539
+ // this can only happen if `n_used` is not accurate, which would be a bug
8540
+ GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
8541
+
8542
+ nf = 0;
8543
+
8544
+ uint32_t i1 = is;
8545
+
8546
+ // are we moving a continuous block of memory?
8547
+ bool cont = false;
8548
+
8549
+ // go back and move the nf cells to the hole
8550
+ for (; i1 < n_kv; ++i1) {
8551
+ auto & cell1 = kv_self.cells[i1];
8552
+
8553
+ if (cell1.is_empty() || ids[i1] != n_kv) {
8554
+ cont = false;
8555
+ continue;
8556
+ }
8557
+
8558
+ // this cell goes to (i0 + nf)
8559
+ ids[i1] = i0 + nf;
8560
+
8561
+ // move the cell meta data
8562
+ kv_self.cells[i0 + nf] = cell1;
8563
+
8564
+ // clear the old cell and move the head there
8565
+ cell1 = llama_kv_cell();
8566
+ kv_self.head = n_used;
8567
+
8568
+ if (!cont) {
8569
+ n_moves++;
8570
+ cont = true;
8571
+ }
8572
+
8573
+ nf++;
8574
+
8575
+ if (nf == nh) {
8576
+ break;
8577
+ }
8578
+ }
8579
+
8580
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
8581
+
8582
+ i0 += nh - 1;
8583
+ }
8584
+
8585
+ if (n_moves == 0) {
8586
+ return;
8587
+ }
8588
+
8589
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
8590
+
8591
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
8592
+
8593
+ #if 0
8594
+ // CPU defrag
8595
+ //
8596
+ // TODO: optimizations are possible:
8597
+ // - multiple threads
8598
+ // - avoid copying to the host memory when already there
8599
+ //
8600
+ // likely not worth the effort, as we have ggml_graph based defrag
8601
+ //
8602
+
8603
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
8604
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
8605
+
8606
+ const uint32_t kv_size = kv_self.size;
8607
+
8608
+ std::vector<uint8_t> buf_k;
8609
+ std::vector<uint8_t> buf_v;
8610
+
8611
+ for (uint32_t il = 0; il < n_layer; ++il) {
8612
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
8613
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
8614
+
8615
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
8616
+ const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
8617
+
8618
+ buf_k.resize(k_size);
8619
+ buf_v.resize(v_size);
8620
+
8621
+ ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8622
+ ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8623
+
8624
+ // batch move [i, i+nm) to [id, id+nm)
8625
+ // note: cells can move only to a lower index
8626
+ for (uint32_t i = 0; i < n_kv; ++i) {
8627
+ const uint32_t id = ids[i];
8628
+
8629
+ if (i == id || id == n_kv) {
8630
+ continue;
8631
+ }
8632
+
8633
+ uint32_t nm = 1;
8634
+
8635
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
8636
+ nm++;
8637
+ }
8638
+
8639
+ // move keys
8640
+ {
8641
+ const int64_t os = i*k_size_row;
8642
+ const int64_t od = id*k_size_row;
8643
+
8644
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
8645
+ }
8646
+
8647
+ // move values (note: they are transposed)
8648
+ {
8649
+ const int64_t os = i;
8650
+ const int64_t od = id;
8651
+
8652
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
8653
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
8654
+ }
8655
+ }
8656
+
8657
+ i += nm - 1;
8658
+ }
8659
+
8660
+ ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8661
+ ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8662
+ }
8663
+ #else
8664
+ // ggml_graph defrag
8665
+
8666
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
8667
+
8668
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8669
+ #endif
8670
+
8671
+ //const int64_t t_end = ggml_time_us();
8672
+
8673
+ //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
8674
+ }
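llama_kv_cache_defrag_internal decides, before anything moves, where every cell should go: holes near the front of the cache are filled with occupied cells pulled from the back, and ids[i] == i or ids[i] == n_kv means the cell stays put. A compact sketch of that mapping over a plain occupancy vector, ignoring the per-move graph-size cap and the block-wise copying the real code performs; the names are illustrative.

    #include <cstdint>
    #include <vector>

    static std::vector<uint32_t> defrag_ids(const std::vector<bool> & occupied) {
        const uint32_t n_kv = (uint32_t) occupied.size();
        uint32_t n_used = 0;
        for (bool o : occupied) n_used += o ? 1 : 0;

        std::vector<uint32_t> ids(n_kv, n_kv);  // n_kv means "not moved"
        uint32_t src = n_kv;                    // scans candidate source cells from the back

        for (uint32_t dst = 0; dst < n_used; ++dst) {
            if (occupied[dst]) {
                ids[dst] = dst;                 // already in place
                continue;
            }
            // find the last occupied, not-yet-moved cell behind the hole
            while (src > dst + 1 && (!occupied[src - 1] || ids[src - 1] != n_kv)) {
                --src;
            }
            if (src <= dst + 1) {
                break;                          // nothing left to pull forward
            }
            --src;
            ids[src] = dst;                     // cell `src` fills the hole at `dst`
        }
        return ids;
    }
    // e.g. occupied = {1,0,1,0,1}  ->  ids = {0, n_kv, 2, n_kv, 1}: cell 4 moves into slot 1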
8675
+
8676
+ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
8677
+ // apply K-shift if needed
8678
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
8679
+ llama_set_k_shift(lctx);
8680
+
8681
+ {
8682
+ ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
8683
+
8684
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8685
+ }
8022
8686
 
8023
- // extract embeddings
8024
- if (!lctx.embedding.empty()) {
8025
- auto & embedding_out = lctx.embedding;
8687
+ {
8688
+ auto & kv_self = lctx.kv_self;
8026
8689
 
8027
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8028
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8690
+ kv_self.has_shift = false;
8029
8691
 
8030
- embedding_out.resize(embd_size);
8031
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8032
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8033
- ggml_backend_synchronize(embeddings_backend);
8692
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
8693
+ kv_self.cells[i].delta = 0;
8694
+ }
8695
+ }
8034
8696
  }
8035
8697
 
8036
- // measure the performance only for the single-token evals
8037
- if (n_tokens == 1) {
8038
- lctx.t_eval_us += ggml_time_us() - t_start_us;
8039
- lctx.n_eval++;
8040
- }
8041
- else if (n_tokens > 1) {
8042
- lctx.t_p_eval_us += ggml_time_us() - t_start_us;
8043
- lctx.n_p_eval += n_tokens;
8044
- }
8698
+ // defragment the KV cache if needed
8699
+ if (lctx.kv_self.do_defrag) {
8700
+ llama_kv_cache_defrag_internal(lctx);
8045
8701
 
8046
- // get a more accurate load time, upon first eval
8047
- // TODO: fix this
8048
- if (!lctx.has_evaluated_once) {
8049
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
8050
- lctx.has_evaluated_once = true;
8702
+ lctx.kv_self.do_defrag = false;
8051
8703
  }
8052
-
8053
- return 0;
8054
8704
  }
8055
8705
 
8056
8706
  //
@@ -8085,19 +8735,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
8085
8735
  GGML_ASSERT(llama_is_byte_token(vocab, id));
8086
8736
  const auto& token_data = vocab.id_to_token.at(id);
8087
8737
  switch (llama_vocab_get_type(vocab)) {
8088
- case LLAMA_VOCAB_TYPE_SPM: {
8089
- auto buf = token_data.text.substr(3, 2);
8090
- return strtol(buf.c_str(), NULL, 16);
8091
- }
8092
- case LLAMA_VOCAB_TYPE_BPE: {
8093
- GGML_ASSERT(false);
8094
- return unicode_to_bytes_bpe(token_data.text);
8095
- }
8096
- case LLAMA_VOCAB_TYPE_WPM: {
8097
- GGML_ASSERT(false);
8098
- }
8099
- default:
8100
- GGML_ASSERT(false);
8738
+ case LLAMA_VOCAB_TYPE_SPM: {
8739
+ auto buf = token_data.text.substr(3, 2);
8740
+ return strtol(buf.c_str(), NULL, 16);
8741
+ }
8742
+ case LLAMA_VOCAB_TYPE_BPE: {
8743
+ GGML_ASSERT(false);
8744
+ return unicode_to_bytes_bpe(token_data.text);
8745
+ }
8746
+ case LLAMA_VOCAB_TYPE_WPM: {
8747
+ GGML_ASSERT(false);
8748
+ }
8749
+ default:
8750
+ GGML_ASSERT(false);
8101
8751
  }
8102
8752
  }
8103
8753
 
@@ -8644,37 +9294,46 @@ struct llm_tokenizer_wpm {
8644
9294
  }
8645
9295
 
8646
9296
  std::vector<std::string> preprocess(const std::string & text) {
8647
- std::string ori_str = normalize(text);
8648
- uint64_t ori_size = ori_str.size();
9297
+ // normalization form D
9298
+ std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
9299
+ std::vector<uint32_t> nfd_codepoints;
9300
+ for (uint32_t code : codepoints) {
9301
+ auto it = nfd_map.equal_range(code);
9302
+ if (it.first != it.second) {
9303
+ for (auto jt = it.first; jt != it.second; jt++) {
9304
+ nfd_codepoints.push_back(jt->second);
9305
+ }
9306
+ } else {
9307
+ nfd_codepoints.push_back(code);
9308
+ }
9309
+ }
8649
9310
 
8650
- // single punct / single symbol / single digit
8651
- // baseline: add whitespace on the left and right of punct and chinese characters
8652
- std::vector<std::string> words;
9311
+ // strip accents, strip control, uniformize whitespace,
9312
+ // to lowercase, pad chinese characters, pad punctuation
8653
9313
  std::string new_str = "";
8654
- uint64_t i = 0;
8655
- while (i < ori_size) {
8656
- int utf_char_len = utf8_len(ori_str[i]);
8657
- if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8658
- new_str += " ";
8659
- new_str += ori_str[i];
8660
- new_str += " ";
8661
- i += 1;
9314
+ for (uint32_t code : nfd_codepoints) {
9315
+ int type = codepoint_type(code);
9316
+ if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
9317
+ continue;
9318
+ }
9319
+ code = to_lower(code);
9320
+ if (type == CODEPOINT_TYPE_WHITESPACE) {
9321
+ code = ' ';
8662
9322
  }
8663
- else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
9323
+ std::string s = codepoint_to_utf8(code);
9324
+ if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
8664
9325
  new_str += " ";
8665
- new_str += ori_str.substr(i, 3);
9326
+ new_str += s;
8666
9327
  new_str += " ";
8667
- i += 3;
8668
- }
8669
- else {
8670
- new_str += ori_str[i];
8671
- i += 1;
9328
+ } else {
9329
+ new_str += s;
8672
9330
  }
8673
9331
  }
8674
9332
 
8675
9333
  // split by whitespace
8676
9334
  uint64_t l = 0;
8677
9335
  uint64_t r = 0;
9336
+ std::vector<std::string> words;
8678
9337
  while (r < new_str.size()) {
8679
9338
  // if is whitespace
8680
9339
  if (isspace(new_str[r])) {
@@ -8692,47 +9351,21 @@ struct llm_tokenizer_wpm {
8692
9351
  return words;
8693
9352
  }
8694
9353
 
8695
- std::string normalize(const std::string & text) {
8696
- // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8697
- std::string text2 = strip_accents(text);
8698
- for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8699
- char c = text2[i];
8700
- if (c >= 'A' && c <= 'Z') {
8701
- text2[i] = c - 'A' + 'a';
8702
- }
9354
+ uint32_t to_lower(uint32_t code) {
9355
+ static const std::locale locale("en_US.UTF-8");
9356
+ #if defined(_WIN32)
9357
+ if (code > 0xFFFF) {
9358
+ return code;
8703
9359
  }
8704
- return text2;
9360
+ #endif
9361
+ return std::tolower(wchar_t(code), locale);
8705
9362
  }
8706
9363
 
8707
- bool is_chinese_char(const std::string & str) {
8708
- int len = str.length();
8709
- unsigned int codepoint = 0;
8710
- int num_bytes = 0;
8711
- int i = 0;
8712
- unsigned char ch = static_cast<unsigned char>(str[i]);
8713
- if (ch <= 0x7f) {
8714
- codepoint = ch;
8715
- num_bytes = 1;
8716
- } else if ((ch >> 5) == 0x06) {
8717
- codepoint = ch & 0x1f;
8718
- num_bytes = 2;
8719
- } else if ((ch >> 4) == 0x0e) {
8720
- codepoint = ch & 0x0f;
8721
- num_bytes = 3;
8722
- } else if ((ch >> 3) == 0x1e) {
8723
- codepoint = ch & 0x07;
8724
- num_bytes = 4;
8725
- }
8726
- for (int j = 1; j < num_bytes; ++j) {
8727
- if (i + j >= len) {
8728
- return false; // incomplete UTF-8 character
8729
- }
8730
- unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8731
- if ((next_ch >> 6) != 0x02) {
8732
- return false; // invalid trailing byte
8733
- }
8734
- codepoint = (codepoint << 6) | (next_ch & 0x3f);
8735
- }
9364
+ bool is_ascii_punct(uint32_t code) {
9365
+ return code < 256 && ispunct(code);
9366
+ }
9367
+
9368
+ bool is_chinese_char(uint32_t codepoint) {
8736
9369
  if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8737
9370
  (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8738
9371
  (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
@@ -8748,41 +9381,6 @@ struct llm_tokenizer_wpm {
8748
9381
  return false;
8749
9382
  }
8750
9383
 
8751
- std::string strip_accents(const std::string & input_string) {
8752
- std::string resultString;
8753
- std::map<std::string, char> accent_map = {
8754
- {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8755
- {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8756
- {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8757
- {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8758
- {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8759
- {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8760
- {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8761
- {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8762
- {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8763
- };
8764
-
8765
- for (size_t i = 0; i < input_string.length();) {
8766
- int len = utf8_len(input_string[i]);
8767
- std::string curChar = input_string.substr(i, len);
8768
- auto iter = accent_map.find(curChar);
8769
- if (iter != accent_map.end()) {
8770
- resultString += iter->second;
8771
- } else {
8772
- resultString += curChar;
8773
- }
8774
- i += len;
8775
- }
8776
-
8777
- return resultString;
8778
- }
8779
-
8780
- static size_t utf8_len(char src) {
8781
- const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8782
- uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8783
- return lookup[highbits];
8784
- }
8785
-
8786
9384
  const llama_vocab & vocab;
8787
9385
  };
8788
9386
 
@@ -9816,10 +10414,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
9816
10414
  }
9817
10415
  }
9818
10416
 
9819
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
9820
- llama_sample_temp(ctx, candidates_p, temp);
9821
- }
9822
-
9823
10417
  void llama_sample_repetition_penalties(
9824
10418
  struct llama_context * ctx,
9825
10419
  llama_token_data_array * candidates,
@@ -9946,38 +10540,6 @@ void llama_sample_apply_guidance(
9946
10540
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9947
10541
  }
9948
10542
 
9949
- void llama_sample_classifier_free_guidance(
9950
- struct llama_context * ctx,
9951
- llama_token_data_array * candidates,
9952
- struct llama_context * guidance_ctx,
9953
- float scale) {
9954
- GGML_ASSERT(ctx);
9955
- int64_t t_start_sample_us;
9956
-
9957
- t_start_sample_us = ggml_time_us();
9958
- const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
9959
-
9960
- GGML_ASSERT(n_vocab == candidates->size);
9961
- GGML_ASSERT(!candidates->sorted);
9962
-
9963
- std::vector<float> logits_base(n_vocab);
9964
- for (size_t i = 0; i < n_vocab; ++i) {
9965
- logits_base[i] = candidates->data[i].logit;
9966
- }
9967
-
9968
- float * logits_guidance = llama_get_logits(guidance_ctx);
9969
-
9970
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9971
- llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
9972
- t_start_sample_us = ggml_time_us();
9973
-
9974
- for (size_t i = 0; i < n_vocab; ++i) {
9975
- candidates->data[i].logit = logits_base[i];
9976
- }
9977
-
9978
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9979
- }
9980
-
9981
10543
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
9982
10544
  GGML_ASSERT(ctx);
9983
10545
 
@@ -10411,7 +10973,7 @@ struct quantize_state_internal {
10411
10973
  {}
10412
10974
  };
10413
10975
 
10414
- static void llama_convert_tensor_internal(
10976
+ static void llama_tensor_dequantize_internal(
10415
10977
  struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
10416
10978
  const size_t nelements, const int nthread
10417
10979
  ) {
@@ -10508,31 +11070,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10508
11070
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10509
11071
  new_type = GGML_TYPE_Q8_0;
10510
11072
  }
10511
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11073
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11074
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10512
11075
  new_type = GGML_TYPE_Q5_K;
10513
11076
  }
10514
11077
  else if (new_type != GGML_TYPE_Q8_0) {
10515
11078
  new_type = GGML_TYPE_Q6_K;
10516
11079
  }
10517
11080
  } else if (name == "token_embd.weight") {
10518
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11081
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
11082
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10519
11083
  new_type = GGML_TYPE_Q2_K;
10520
11084
  }
11085
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11086
+ new_type = GGML_TYPE_IQ3_S;
11087
+ }
10521
11088
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10522
- new_type = GGML_TYPE_Q4_K;
11089
+ new_type = GGML_TYPE_IQ3_S;
10523
11090
  }
10524
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11091
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
11092
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10525
11093
  if (name.find("attn_v.weight") != std::string::npos) {
10526
11094
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
10527
- else new_type = GGML_TYPE_Q2_K;
11095
+ else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10528
11096
  ++qs.i_attention_wv;
10529
11097
  }
11098
+ else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
11099
+ new_type = GGML_TYPE_Q4_K;
11100
+ }
10530
11101
  else if (name.find("ffn_down") != std::string::npos) {
10531
- if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
11102
+ if (qs.i_ffn_down < qs.n_ffn_down/8) {
11103
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
11104
+ }
10532
11105
  ++qs.i_ffn_down;
10533
11106
  }
10534
11107
  else if (name.find("attn_output.weight") != std::string::npos) {
10535
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
11108
+ if (qs.model.hparams.n_expert == 8) {
11109
+ new_type = GGML_TYPE_Q5_K;
11110
+ } else {
11111
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
11112
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
11113
+ }
10536
11114
  }
10537
11115
  } else if (name.find("attn_v.weight") != std::string::npos) {
10538
11116
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -10542,13 +11120,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10542
11120
  new_type = GGML_TYPE_Q4_K;
10543
11121
  }
10544
11122
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10545
- new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
11123
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
11124
+ }
11125
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
11126
+ new_type = GGML_TYPE_Q4_K;
11127
+ }
11128
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11129
+ new_type = GGML_TYPE_Q4_K;
11130
+ }
11131
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
11132
+ new_type = GGML_TYPE_Q4_K;
11133
+ }
11134
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11135
+ new_type = GGML_TYPE_Q4_K;
10546
11136
  }
10547
11137
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
10548
11138
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
10549
11139
  }
10550
11140
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10551
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
11141
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
10552
11142
  new_type = GGML_TYPE_Q5_K;
10553
11143
  }
10554
11144
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -10574,14 +11164,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10574
11164
  // TODO: explore better strategies
10575
11165
  new_type = GGML_TYPE_Q8_0;
10576
11166
  }
10577
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10578
- new_type = GGML_TYPE_Q2_K;
11167
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
11168
+ new_type = GGML_TYPE_IQ3_XXS;
11169
+ }
11170
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11171
+ new_type = GGML_TYPE_IQ2_S;
11172
+ }
11173
+ } else if (name.find("attn_q.weight") != std::string::npos) {
11174
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
11175
+ new_type = GGML_TYPE_IQ3_XXS;
11176
+ }
11177
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11178
+ new_type = GGML_TYPE_IQ2_S;
10579
11179
  }
10580
11180
  } else if (name.find("ffn_down") != std::string::npos) {
10581
11181
  auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
10582
11182
  int i_layer = info.first, n_layer = info.second;
10583
11183
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
10584
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
11184
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
10585
11185
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
10586
11186
  }
10587
11187
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10592,6 +11192,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10592
11192
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
10593
11193
  : GGML_TYPE_Q3_K;
10594
11194
  }
11195
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
11196
+ (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
11197
+ new_type = GGML_TYPE_Q4_K;
11198
+ }
10595
11199
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
10596
11200
  new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
10597
11201
  }
@@ -10603,8 +11207,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10603
11207
  if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10604
11208
  }
10605
11209
  }
10606
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
10607
- if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
11210
+ else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
11211
+ new_type = GGML_TYPE_Q5_K;
10608
11212
  }
10609
11213
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10610
11214
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -10621,39 +11225,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10621
11225
  } else if (name.find("attn_output.weight") != std::string::npos) {
10622
11226
  if (arch != LLM_ARCH_FALCON) {
10623
11227
  if (qs.model.hparams.n_expert == 8) {
10624
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11228
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10625
11229
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
10626
- ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
11230
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
11231
+ ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
10627
11232
  new_type = GGML_TYPE_Q5_K;
10628
11233
  }
10629
11234
  } else {
10630
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10631
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
10632
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
10633
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
11235
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
11236
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
11237
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
11238
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
11239
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
10634
11240
  }
10635
11241
  } else {
10636
11242
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10637
11243
  }
10638
11244
  }
10639
11245
  else if (name.find("attn_qkv.weight") != std::string::npos) {
10640
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
11246
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11247
+ new_type = GGML_TYPE_Q4_K;
11248
+ }
10641
11249
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
10642
11250
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
10643
11251
  }
10644
11252
  else if (name.find("ffn_gate") != std::string::npos) {
10645
11253
  auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
10646
11254
  int i_layer = info.first, n_layer = info.second;
10647
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10648
- new_type = GGML_TYPE_Q2_K;
11255
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
11256
+ new_type = GGML_TYPE_IQ3_XXS;
10649
11257
  }
10650
11258
  ++qs.i_ffn_gate;
10651
11259
  }
10652
11260
  else if (name.find("ffn_up") != std::string::npos) {
10653
11261
  auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
10654
11262
  int i_layer = info.first, n_layer = info.second;
10655
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10656
- new_type = GGML_TYPE_Q2_K;
11263
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
11264
+ new_type = GGML_TYPE_IQ3_XXS;
10657
11265
  }
10658
11266
  ++qs.i_ffn_up;
10659
11267
  }
@@ -10671,9 +11279,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10671
11279
  //}
10672
11280
  bool convert_incompatible_tensor = false;
10673
11281
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
10674
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
10675
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
10676
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11282
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
11283
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
11284
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
10677
11285
  int nx = tensor->ne[0];
10678
11286
  int ny = tensor->ne[1];
10679
11287
  if (nx % QK_K != 0) {
@@ -10687,13 +11295,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10687
11295
  switch (new_type) {
10688
11296
  case GGML_TYPE_IQ2_XXS:
10689
11297
  case GGML_TYPE_IQ2_XS:
11298
+ case GGML_TYPE_IQ2_S:
10690
11299
  case GGML_TYPE_IQ3_XXS:
11300
+ case GGML_TYPE_IQ3_S:
10691
11301
  case GGML_TYPE_IQ1_S:
10692
11302
  case GGML_TYPE_Q2_K:
10693
- case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
10694
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10695
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10696
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
11303
+ case GGML_TYPE_Q3_K:
11304
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
11305
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
11306
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
11307
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10697
11308
  default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
10698
11309
  }
10699
11310
  LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
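
The fallback switch above only triggers for tensors whose row length is not a multiple of the k-quant super-block size, since k- and IQ-quants pack whole super-blocks per row. A small, hedged sketch of that shape check (QK_K is 256 in the default build; treat the constant as an assumption):

    #include <cstdint>
    #include <cstdio>

    // Hedged sketch: a row of nx elements can only hold whole super-blocks of
    // qk_k elements; otherwise the quantizer falls back to a legacy type.
    static bool fits_k_quant(int64_t nx, int64_t qk_k = 256) {
        return nx % qk_k == 0;
    }

    int main() {
        printf("4096 cols: %s\n", fits_k_quant(4096) ? "k-quant ok" : "fallback");
        printf("4100 cols: %s\n", fits_k_quant(4100) ? "k-quant ok" : "fallback");
        return 0;
    }
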
@@ -10703,6 +11314,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10703
11314
  return new_type;
10704
11315
  }
10705
11316
 
11317
+ static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
11318
+ std::mutex mutex;
11319
+ int counter = 0;
11320
+ size_t new_size = 0;
11321
+ if (nthread < 2) {
11322
+ // single-thread
11323
+ return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
11324
+ }
11325
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
11326
+ nrows, n_per_row, imatrix]() {
11327
+ std::array<int64_t, 1 << 4> local_hist = {};
11328
+ const int nrows_per_chunk = chunk_size / n_per_row;
11329
+ size_t local_size = 0;
11330
+ while (true) {
11331
+ std::unique_lock<std::mutex> lock(mutex);
11332
+ int first_row = counter; counter += nrows_per_chunk;
11333
+ if (first_row >= nrows) {
11334
+ if (local_size > 0) {
11335
+ for (int j=0; j<int(local_hist.size()); ++j) {
11336
+ hist_cur[j] += local_hist[j];
11337
+ }
11338
+ new_size += local_size;
11339
+ }
11340
+ break;
11341
+ }
11342
+ lock.unlock();
11343
+ const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
11344
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
11345
+ first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
11346
+ }
11347
+ };
11348
+ for (int it = 0; it < nthread - 1; ++it) {
11349
+ workers.emplace_back(compute);
11350
+ }
11351
+ compute();
11352
+ for (auto & w : workers) { w.join(); }
11353
+ workers.clear();
11354
+ return new_size;
11355
+ }
11356
+
10706
11357
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
10707
11358
  ggml_type quantized_type;
10708
11359
  llama_ftype ftype = params->ftype;
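
The new llama_tensor_quantize_internal above lifts the previously inlined worker loop into a reusable helper: a shared row counter guarded by a mutex hands fixed-size chunks to nthread - 1 spawned threads plus the calling thread, and per-thread histogram/size totals are merged at the end. A stripped-down, hedged sketch of just the scheduling pattern (no ggml calls, no histograms):

    #include <algorithm>
    #include <functional>
    #include <mutex>
    #include <thread>
    #include <vector>

    // Hedged sketch: threads repeatedly claim a fixed-size range of rows from a
    // shared counter until every row has been handed out. The callback must be
    // thread-safe; claimed ranges never overlap.
    static void for_each_chunk(int nrows, int rows_per_chunk, int nthread,
                               const std::function<void(int first, int count)> & fn) {
        std::mutex mutex;
        int counter = 0;
        auto worker = [&]() {
            for (;;) {
                int first;
                {
                    std::lock_guard<std::mutex> lock(mutex);
                    first    = counter;
                    counter += rows_per_chunk;
                }
                if (first >= nrows) {
                    break;
                }
                fn(first, std::min(nrows - first, rows_per_chunk));
            }
        };
        std::vector<std::thread> workers;
        for (int i = 0; i < nthread - 1; ++i) {
            workers.emplace_back(worker);
        }
        worker(); // the calling thread participates, as in the helper above
        for (auto & w : workers) {
            w.join();
        }
    }
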
@@ -10719,7 +11370,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10719
11370
  // K-quants
10720
11371
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10721
11372
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10722
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
11373
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
10723
11374
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10724
11375
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10725
11376
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -10730,9 +11381,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10730
11381
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10731
11382
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10732
11383
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
11384
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
11385
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
10733
11386
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10734
11387
  case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10735
11388
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
11389
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
11390
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
11391
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
10736
11392
 
10737
11393
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
10738
11394
  }
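
With the new file types wired into this switch, selecting one of them through the public quantization API is a single-field change. A hedged usage sketch; the GGUF paths are placeholders:

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_XS; // one of the types added in this release
        params.nthread = 8;

        // hypothetical input/output paths
        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &params);
        return rc == 0 ? 0 : 1;
    }
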
@@ -10810,7 +11466,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10810
11466
 
10811
11467
  std::vector<std::thread> workers;
10812
11468
  workers.reserve(nthread);
10813
- std::mutex mutex;
10814
11469
 
10815
11470
  int idx = 0;
10816
11471
 
@@ -10862,7 +11517,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10862
11517
  quantize &= !params->only_copy;
10863
11518
 
10864
11519
  // do not quantize expert gating tensors
10865
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
11520
+ // NOTE: can't use LLM_TN here because the layer number is not known
11521
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
10866
11522
 
10867
11523
  // do not quantize positional embeddings and token types (BERT)
10868
11524
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
@@ -10906,6 +11562,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10906
11562
  }
10907
11563
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10908
11564
  new_type == GGML_TYPE_IQ2_XS ||
11565
+ new_type == GGML_TYPE_IQ2_S ||
10909
11566
  new_type == GGML_TYPE_IQ1_S ||
10910
11567
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10911
11568
  LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -10922,7 +11579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10922
11579
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
10923
11580
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
10924
11581
  } else {
10925
- llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
11582
+ llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
10926
11583
  f32_data = (float *) f32_conv_buf.data();
10927
11584
  }
10928
11585
 
@@ -10943,41 +11600,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10943
11600
 
10944
11601
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
10945
11602
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
10946
- if (nthread_use < 2) {
10947
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
10948
- } else {
10949
- int counter = 0;
10950
- new_size = 0;
10951
- auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
10952
- nrows, n_per_row, imatrix]() {
10953
- std::array<int64_t, 1 << 4> local_hist = {};
10954
- const int nrows_per_chunk = chunk_size / n_per_row;
10955
- size_t local_size = 0;
10956
- while (true) {
10957
- std::unique_lock<std::mutex> lock(mutex);
10958
- int first_row = counter; counter += nrows_per_chunk;
10959
- if (first_row >= nrows) {
10960
- if (local_size > 0) {
10961
- for (int j=0; j<int(local_hist.size()); ++j) {
10962
- hist_cur[j] += local_hist[j];
10963
- }
10964
- new_size += local_size;
10965
- }
10966
- break;
10967
- }
10968
- lock.unlock();
10969
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
10970
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
10971
- first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
10972
- }
10973
- };
10974
- for (int it = 0; it < nthread_use - 1; ++it) {
10975
- workers.emplace_back(compute);
10976
- }
10977
- compute();
10978
- for (auto & w : workers) { w.join(); }
10979
- workers.clear();
10980
- }
11603
+ new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
10981
11604
 
10982
11605
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
10983
11606
  int64_t tot_count = 0;
@@ -11327,7 +11950,7 @@ static int llama_apply_lora_from_file_internal(
11327
11950
  struct llama_model_params llama_model_default_params() {
11328
11951
  struct llama_model_params result = {
11329
11952
  /*.n_gpu_layers =*/ 0,
11330
- /*.split_mode =*/ LLAMA_SPLIT_LAYER,
11953
+ /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
11331
11954
  /*.main_gpu =*/ 0,
11332
11955
  /*.tensor_split =*/ nullptr,
11333
11956
  /*.progress_callback =*/ nullptr,
@@ -11353,7 +11976,8 @@ struct llama_context_params llama_context_default_params() {
11353
11976
  /*.n_batch =*/ 512,
11354
11977
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
11355
11978
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
11356
- /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
11979
+ /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
11980
+ /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
11357
11981
  /*.rope_freq_base =*/ 0.0f,
11358
11982
  /*.rope_freq_scale =*/ 0.0f,
11359
11983
  /*.yarn_ext_factor =*/ -1.0f,
@@ -11361,15 +11985,16 @@ struct llama_context_params llama_context_default_params() {
11361
11985
  /*.yarn_beta_fast =*/ 32.0f,
11362
11986
  /*.yarn_beta_slow =*/ 1.0f,
11363
11987
  /*.yarn_orig_ctx =*/ 0,
11988
+ /*.defrag_thold =*/ -1.0f,
11364
11989
  /*.cb_eval =*/ nullptr,
11365
11990
  /*.cb_eval_user_data =*/ nullptr,
11366
11991
  /*.type_k =*/ GGML_TYPE_F16,
11367
11992
  /*.type_v =*/ GGML_TYPE_F16,
11368
- /*.mul_mat_q =*/ true,
11369
11993
  /*.logits_all =*/ false,
11370
- /*.embedding =*/ false,
11994
+ /*.embeddings =*/ false,
11371
11995
  /*.offload_kqv =*/ true,
11372
- /*.do_pooling =*/ true,
11996
+ /*.abort_callback =*/ nullptr,
11997
+ /*.abort_callback_data =*/ nullptr,
11373
11998
  };
11374
11999
 
11375
12000
  return result;
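
For embedders of the C API, the visible breaks in these default-parameter structs are the renamed enum constants (LLAMA_SPLIT_MODE_*, LLAMA_ROPE_SCALING_TYPE_*), the removal of mul_mat_q and do_pooling, the rename of embedding to embeddings, and the new pooling_type, defrag_thold and abort-callback fields. A hedged sketch of filling the 0.14.0-era context parameters:

    #include "llama.h"

    static llama_context * make_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx             = 4096;
        cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; // renamed enum
        cparams.pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED;      // new field
        cparams.defrag_thold      = -1.0f;  // new field; negative (the default) leaves defrag manual
        cparams.embeddings        = false;  // was `embedding` in 0.12.x
        // `mul_mat_q` and `do_pooling` no longer exist and must be dropped;
        // abort_callback / abort_callback_data are also new (see llama_set_abort_callback below).
        return llama_new_context_with_model(model, cparams);
    }
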
@@ -11421,15 +12046,6 @@ bool llama_supports_gpu_offload(void) {
11421
12046
  #endif
11422
12047
  }
11423
12048
 
11424
- // deprecated:
11425
- bool llama_mmap_supported(void) {
11426
- return llama_supports_mmap();
11427
- }
11428
-
11429
- bool llama_mlock_supported(void) {
11430
- return llama_supports_mlock();
11431
- }
11432
-
11433
12049
  void llama_backend_init(void) {
11434
12050
  ggml_time_init();
11435
12051
 
@@ -11525,9 +12141,10 @@ struct llama_context * llama_new_context_with_model(
11525
12141
  cparams.yarn_attn_factor = params.yarn_attn_factor;
11526
12142
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11527
12143
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11528
- cparams.mul_mat_q = params.mul_mat_q;
12144
+ cparams.defrag_thold = params.defrag_thold;
12145
+ cparams.embeddings = params.embeddings;
11529
12146
  cparams.offload_kqv = params.offload_kqv;
11530
- cparams.do_pooling = params.do_pooling;
12147
+ cparams.pooling_type = params.pooling_type;
11531
12148
 
11532
12149
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
11533
12150
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -11541,16 +12158,24 @@ struct llama_context * llama_new_context_with_model(
11541
12158
  cparams.cb_eval_user_data = params.cb_eval_user_data;
11542
12159
 
11543
12160
  auto rope_scaling_type = params.rope_scaling_type;
11544
- if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
12161
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
11545
12162
  rope_scaling_type = hparams.rope_scaling_type_train;
11546
12163
  }
11547
12164
 
11548
- if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
12165
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
11549
12166
  cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
11550
12167
  }
11551
12168
 
11552
12169
  if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
11553
- cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
12170
+ cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
12171
+ }
12172
+
12173
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12174
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12175
+ cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
12176
+ } else {
12177
+ cparams.pooling_type = hparams.pooling_type;
12178
+ }
11554
12179
  }
11555
12180
 
11556
12181
  if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11561,8 +12186,11 @@ struct llama_context * llama_new_context_with_model(
11561
12186
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
11562
12187
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
11563
12188
 
11564
- ctx->rng = std::mt19937(params.seed);
11565
- ctx->logits_all = params.logits_all;
12189
+ ctx->abort_callback = params.abort_callback;
12190
+ ctx->abort_callback_data = params.abort_callback_data;
12191
+
12192
+ ctx->rng = std::mt19937(params.seed);
12193
+ ctx->logits_all = params.logits_all;
11566
12194
 
11567
12195
  const ggml_type type_k = params.type_k;
11568
12196
  const ggml_type type_v = params.type_v;
@@ -11584,8 +12212,8 @@ struct llama_context * llama_new_context_with_model(
11584
12212
  }
11585
12213
  #elif defined(GGML_USE_CUBLAS)
11586
12214
  if (model->n_gpu_layers > 0) {
11587
- // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
11588
- if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
12215
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12216
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
11589
12217
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
11590
12218
  if (backend == nullptr) {
11591
12219
  LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11594,7 +12222,7 @@ struct llama_context * llama_new_context_with_model(
11594
12222
  }
11595
12223
  ctx->backends.push_back(backend);
11596
12224
  } else {
11597
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
12225
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
11598
12226
  for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
11599
12227
  ggml_backend_t backend = ggml_backend_cuda_init(device);
11600
12228
  if (backend == nullptr) {
@@ -11620,13 +12248,31 @@ struct llama_context * llama_new_context_with_model(
11620
12248
  }
11621
12249
  #elif defined(GGML_USE_SYCL)
11622
12250
  if (model->n_gpu_layers > 0) {
11623
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
11624
- if (backend == nullptr) {
11625
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
11626
- llama_free(ctx);
11627
- return nullptr;
12251
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12252
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12253
+ int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
12254
+ ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
12255
+ if (backend == nullptr) {
12256
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);

12257
+ llama_free(ctx);
12258
+ return nullptr;
12259
+ }
12260
+ ctx->backends.push_back(backend);
12261
+ } else {
12262
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
12263
+ int id_list[GGML_SYCL_MAX_DEVICES];
12264
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
12265
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
12266
+ int device_id = id_list[i];
12267
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
12268
+ if (backend == nullptr) {
12269
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
12270
+ llama_free(ctx);
12271
+ return nullptr;
12272
+ }
12273
+ ctx->backends.push_back(backend);
12274
+ }
11628
12275
  }
11629
- ctx->backends.push_back(backend);
11630
12276
  }
11631
12277
  #elif defined(GGML_USE_KOMPUTE)
11632
12278
  if (model->n_gpu_layers > 0) {
@@ -11647,8 +12293,7 @@ struct llama_context * llama_new_context_with_model(
11647
12293
  }
11648
12294
  ctx->backends.push_back(ctx->backend_cpu);
11649
12295
 
11650
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
11651
- cparams.n_ctx, cparams.offload_kqv)) {
12296
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
11652
12297
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
11653
12298
  llama_free(ctx);
11654
12299
  return nullptr;
@@ -11675,8 +12320,8 @@ struct llama_context * llama_new_context_with_model(
11675
12320
  // resized during inference, reserve maximum
11676
12321
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11677
12322
 
11678
- if (params.embedding) {
11679
- ctx->embedding.resize(hparams.n_embd);
12323
+ if (params.embeddings) {
12324
+ ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
11680
12325
  }
11681
12326
 
11682
12327
  // graph inputs
@@ -11707,7 +12352,6 @@ struct llama_context * llama_new_context_with_model(
11707
12352
  ggml_set_name(ctx->inp_cls, "inp_cls");
11708
12353
 
11709
12354
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
11710
-
11711
12355
  LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
11712
12356
  ggml_backend_buffer_name(ctx->buf_input),
11713
12357
  ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
@@ -11727,7 +12371,7 @@ struct llama_context * llama_new_context_with_model(
11727
12371
  }
11728
12372
 
11729
12373
  // buffer used to store the computation graph and the tensor meta data
11730
- ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
12374
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
11731
12375
 
11732
12376
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
11733
12377
 
@@ -11796,6 +12440,50 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
11796
12440
  return model->vocab.type;
11797
12441
  }
11798
12442
 
12443
+ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12444
+ switch (model->arch) {
12445
+ // these models do not use RoPE
12446
+ case LLM_ARCH_GPT2:
12447
+ case LLM_ARCH_GPTJ:
12448
+ case LLM_ARCH_GPTNEOX:
12449
+ case LLM_ARCH_MPT:
12450
+ case LLM_ARCH_REFACT:
12451
+ case LLM_ARCH_BLOOM:
12452
+ return LLAMA_ROPE_TYPE_NONE;
12453
+
12454
+ // use what we call a normal RoPE, operating on pairs of consecutive head values
12455
+ case LLM_ARCH_LLAMA:
12456
+ case LLM_ARCH_BAICHUAN:
12457
+ case LLM_ARCH_STARCODER:
12458
+ case LLM_ARCH_PLAMO:
12459
+ case LLM_ARCH_CODESHELL:
12460
+ case LLM_ARCH_ORION:
12461
+ case LLM_ARCH_INTERNLM2:
12462
+ case LLM_ARCH_MINICPM:
12463
+ return LLAMA_ROPE_TYPE_NORM;
12464
+
12465
+ // the pairs of head values are offset by n_rot/2
12466
+ case LLM_ARCH_FALCON:
12467
+ case LLM_ARCH_PERSIMMON:
12468
+ case LLM_ARCH_BERT:
12469
+ case LLM_ARCH_NOMIC_BERT:
12470
+ case LLM_ARCH_STABLELM:
12471
+ case LLM_ARCH_QWEN:
12472
+ case LLM_ARCH_QWEN2:
12473
+ case LLM_ARCH_PHI2:
12474
+ case LLM_ARCH_GEMMA:
12475
+ case LLM_ARCH_STARCODER2:
12476
+ return LLAMA_ROPE_TYPE_NEOX;
12477
+
12478
+ // all model arches should be listed explicitly here
12479
+ case LLM_ARCH_UNKNOWN:
12480
+ GGML_ASSERT(false && "unknown architecture");
12481
+ break;
12482
+ }
12483
+
12484
+ return LLAMA_ROPE_TYPE_NONE;
12485
+ }
12486
+
11799
12487
  int32_t llama_n_vocab(const struct llama_model * model) {
11800
12488
  return model->vocab.id_to_token.size();
11801
12489
  }
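
llama_rope_type is a new public accessor exposing the per-architecture classification above, so frontends that need to know the RoPE flavour no longer have to hard-code architecture lists. A minimal hedged sketch:

    #include "llama.h"
    #include <cstdio>

    static void print_rope_type(const llama_model * model) {
        switch (llama_rope_type(model)) {
            case LLAMA_ROPE_TYPE_NONE: printf("no RoPE\n");               break;
            case LLAMA_ROPE_TYPE_NORM: printf("normal (paired) RoPE\n");  break;
            case LLAMA_ROPE_TYPE_NEOX: printf("NeoX-style RoPE\n");       break;
            default:                   printf("other\n");                 break;
        }
    }
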
@@ -11898,15 +12586,6 @@ uint32_t llama_model_quantize(
11898
12586
  }
11899
12587
  }
11900
12588
 
11901
- int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11902
- try {
11903
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
11904
- } catch (const std::exception & err) {
11905
- LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
11906
- return 1;
11907
- }
11908
- }
11909
-
11910
12589
  int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11911
12590
  try {
11912
12591
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
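
The context-level llama_apply_lora_from_file wrapper is removed here; only the model-level entry point remains. A hedged migration sketch — the adapter path and thread count are placeholders:

    #include "llama.h"

    // Hedged sketch: apply a LoRA adapter to the model before creating contexts.
    static bool apply_lora(const llama_model * model) {
        const int32_t rc = llama_model_apply_lora_from_file(
            model,
            "adapter.gguf",   // hypothetical adapter path
            1.0f,             // scale
            nullptr,          // optional f16 base model for quantized models
            4);               // n_threads
        return rc == 0;
    }
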
@@ -12038,12 +12717,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
12038
12717
  llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
12039
12718
  }
12040
12719
 
12041
- void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12720
+ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12042
12721
  if (delta == 0) {
12043
12722
  return;
12044
12723
  }
12045
12724
 
12046
- llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
12725
+ llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
12047
12726
  }
12048
12727
 
12049
12728
  void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -12054,6 +12733,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
12054
12733
  llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
12055
12734
  }
12056
12735
 
12736
+ llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
12737
+ return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
12738
+ }
12739
+
12740
+ void llama_kv_cache_defrag(struct llama_context * ctx) {
12741
+ llama_kv_cache_defrag(ctx->kv_self);
12742
+ }
12743
+
12744
+ void llama_kv_cache_update(struct llama_context * ctx) {
12745
+ llama_kv_cache_update_internal(*ctx);
12746
+ }
12747
+
12748
+
12057
12749
  // Returns the *maximum* size of the state
12058
12750
  size_t llama_get_state_size(const struct llama_context * ctx) {
12059
12751
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
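
Together with the llama_kv_cache_seq_add rename in the previous hunk, these new entry points cover the usual "context shift" recipe: query the highest position in a sequence, drop the oldest cells, slide the rest down, then optionally defragment and apply the pending changes. A hedged sketch for a single sequence:

    #include "llama.h"

    // Hedged sketch: free the oldest n_discard cells of sequence 0 and shift the
    // remaining positions so decoding can continue inside a full context window.
    static void context_shift(llama_context * ctx, llama_pos n_discard) {
        const llama_seq_id seq   = 0;
        const llama_pos    p_max = llama_kv_cache_seq_pos_max(ctx, seq); // new accessor

        llama_kv_cache_seq_rm (ctx, seq, 0,         n_discard);              // drop [0, n_discard)
        llama_kv_cache_seq_add(ctx, seq, n_discard, p_max + 1, -n_discard);  // was ..._seq_shift

        llama_kv_cache_defrag(ctx);   // optional: schedule defragmentation
        llama_kv_cache_update(ctx);   // apply pending shifts/defrag now
    }
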
@@ -12064,10 +12756,15 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12064
12756
  // assume worst case for logits although only currently set ones are serialized
12065
12757
  const size_t s_logits = ctx->logits.capacity() * sizeof(float);
12066
12758
  const size_t s_embedding_size = sizeof(size_t);
12067
- const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12068
- const size_t s_kv_size = sizeof(size_t);
12069
- const size_t s_kv_ntok = sizeof(int);
12759
+ const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
12760
+ const size_t s_kv_buf_size = sizeof(size_t);
12761
+ const size_t s_kv_head = sizeof(uint32_t);
12762
+ const size_t s_kv_size = sizeof(uint32_t);
12763
+ const size_t s_kv_used = sizeof(uint32_t);
12070
12764
  const size_t s_kv = ctx->kv_self.total_size();
12765
+ // TODO: assume the max is more than 1 seq_id per KV cell
12766
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
12767
+ const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
12071
12768
 
12072
12769
  const size_t s_total = (
12073
12770
  + s_rng_size
@@ -12076,9 +12773,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12076
12773
  + s_logits
12077
12774
  + s_embedding_size
12078
12775
  + s_embedding
12776
+ + s_kv_buf_size
12777
+ + s_kv_head
12079
12778
  + s_kv_size
12080
- + s_kv_ntok
12779
+ + s_kv_used
12081
12780
  + s_kv
12781
+ + s_kv_cells
12082
12782
  );
12083
12783
 
12084
12784
  return s_total;
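
The state-size estimate now also accounts for the per-cell KV metadata that the serializer below writes out, but the public save/restore round trip is unchanged apart from llama_set_state_data taking a const pointer. A hedged sketch of that round trip:

    #include "llama.h"
    #include <vector>

    // Hedged sketch: snapshot a context's full state (RNG, logits, embeddings,
    // KV cache) into a buffer and restore it later.
    static std::vector<uint8_t> snapshot(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx)); // upper bound
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written);
        return buf;
    }

    static void restore(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data()); // takes const uint8_t * as of this release
    }
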
@@ -12165,12 +12865,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12165
12865
 
12166
12866
  // copy embeddings
12167
12867
  {
12168
- const size_t embedding_size = ctx->embedding.size();
12868
+ const size_t embeddings_size = ctx->embd.size();
12169
12869
 
12170
- data_ctx->write(&embedding_size, sizeof(embedding_size));
12870
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
12171
12871
 
12172
- if (embedding_size) {
12173
- data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
12872
+ if (embeddings_size) {
12873
+ data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
12174
12874
  }
12175
12875
  }
12176
12876
 
@@ -12178,15 +12878,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12178
12878
  {
12179
12879
  const auto & kv_self = ctx->kv_self;
12180
12880
  const auto & hparams = ctx->model.hparams;
12181
- const auto & cparams = ctx->cparams;
12182
12881
 
12183
- const auto n_layer = hparams.n_layer;
12184
- const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
12185
- const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
12186
- const auto n_ctx = cparams.n_ctx;
12882
+ const uint32_t n_layer = hparams.n_layer;
12883
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12884
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12187
12885
 
12188
12886
  const size_t kv_buf_size = kv_self.total_size();
12189
- const uint32_t kv_head = kv_self.head;
12887
+ const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
12190
12888
  const uint32_t kv_size = kv_self.size;
12191
12889
  const uint32_t kv_used = kv_self.used;
12192
12890
 
@@ -12198,14 +12896,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12198
12896
  if (kv_buf_size) {
12199
12897
  std::vector<uint8_t> tmp_buf;
12200
12898
  for (int il = 0; il < (int) n_layer; ++il) {
12201
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12899
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12900
+
12202
12901
  tmp_buf.resize(k_size);
12203
12902
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
12204
12903
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
12205
12904
 
12206
12905
  // v is not contiguous, copy row by row
12207
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12208
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12906
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12907
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12908
+
12209
12909
  tmp_buf.resize(v_row_size);
12210
12910
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12211
12911
  ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
@@ -12214,7 +12914,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12214
12914
  }
12215
12915
  }
12216
12916
 
12217
- for (uint32_t i = 0; i < kv_size; ++i) {
12917
+ for (uint32_t i = 0; i < kv_head; ++i) {
12218
12918
  const auto & cell = kv_self.cells[i];
12219
12919
 
12220
12920
  const llama_pos pos = cell.pos;
@@ -12238,8 +12938,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
12238
12938
  }
12239
12939
 
12240
12940
  // Sets the state reading from the specified source address
12241
- size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12242
- uint8_t * inp = src;
12941
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12942
+ const uint8_t * inp = src;
12243
12943
 
12244
12944
  // set rng
12245
12945
  {
@@ -12248,7 +12948,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12248
12948
 
12249
12949
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
12250
12950
 
12251
- std::string rng_str((char *)inp, rng_size); inp += rng_size;
12951
+ std::string rng_str((const char *)inp, rng_size); inp += rng_size;
12252
12952
 
12253
12953
  std::istringstream rng_ss(rng_str);
12254
12954
  rng_ss >> ctx->rng;
@@ -12274,15 +12974,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12274
12974
 
12275
12975
  // set embeddings
12276
12976
  {
12277
- size_t embedding_size;
12977
+ size_t embeddings_size;
12978
+
12979
+ memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
12278
12980
 
12279
- memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
12981
+ GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
12280
12982
 
12281
- GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
12983
+ if (embeddings_size) {
12984
+ ctx->embd.resize(embeddings_size);
12282
12985
 
12283
- if (embedding_size) {
12284
- memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12285
- inp += embedding_size * sizeof(float);
12986
+ memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
12987
+ inp += embeddings_size * sizeof(float);
12286
12988
  }
12287
12989
  }
12288
12990
 
@@ -12290,12 +12992,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12290
12992
  {
12291
12993
  const auto & kv_self = ctx->kv_self;
12292
12994
  const auto & hparams = ctx->model.hparams;
12293
- const auto & cparams = ctx->cparams;
12294
12995
 
12295
- const int n_layer = hparams.n_layer;
12296
- const int n_embd_k_gqa = hparams.n_embd_k_gqa();
12297
- const int n_embd_v_gqa = hparams.n_embd_v_gqa();
12298
- const int n_ctx = cparams.n_ctx;
12996
+ const uint32_t n_layer = hparams.n_layer;
12997
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12998
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12299
12999
 
12300
13000
  size_t kv_buf_size;
12301
13001
  uint32_t kv_head;
@@ -12311,13 +13011,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12311
13011
  GGML_ASSERT(kv_self.total_size() == kv_buf_size);
12312
13012
 
12313
13013
  for (int il = 0; il < (int) n_layer; ++il) {
12314
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
13014
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
13015
+
12315
13016
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
12316
13017
  inp += k_size;
12317
13018
 
12318
13019
  // v is not contiguous, copy row by row
12319
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12320
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
13020
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
13021
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
13022
+
12321
13023
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12322
13024
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
12323
13025
  inp += v_row_size;
@@ -12325,13 +13027,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12325
13027
  }
12326
13028
  }
12327
13029
 
13030
+ GGML_ASSERT(kv_self.size == kv_size);
13031
+
12328
13032
  ctx->kv_self.head = kv_head;
12329
13033
  ctx->kv_self.size = kv_size;
12330
13034
  ctx->kv_self.used = kv_used;
12331
13035
 
12332
13036
  ctx->kv_self.cells.resize(kv_size);
12333
13037
 
12334
- for (uint32_t i = 0; i < kv_size; ++i) {
13038
+ for (uint32_t i = 0; i < kv_head; ++i) {
12335
13039
  llama_pos pos;
12336
13040
  size_t seq_id_size;
12337
13041
 
@@ -12347,6 +13051,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12347
13051
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
12348
13052
  }
12349
13053
  }
13054
+
13055
+ for (uint32_t i = kv_head; i < kv_size; ++i) {
13056
+ ctx->kv_self.cells[i].pos = -1;
13057
+ ctx->kv_self.cells[i].seq_id.clear();
13058
+ }
12350
13059
  }
12351
13060
 
12352
13061
  const size_t nread = inp - src;
@@ -12439,43 +13148,16 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
12439
13148
  return true;
12440
13149
  }
12441
13150
 
12442
- int llama_eval(
12443
- struct llama_context * ctx,
12444
- llama_token * tokens,
12445
- int32_t n_tokens,
12446
- int32_t n_past) {
12447
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12448
-
12449
- const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
12450
- if (ret < 0) {
12451
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12452
- }
12453
-
12454
- return ret;
12455
- }
12456
-
12457
- int llama_eval_embd(
12458
- struct llama_context * ctx,
12459
- float * embd,
12460
- int32_t n_tokens,
12461
- int32_t n_past) {
12462
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12463
-
12464
- llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
12465
-
12466
- const int ret = llama_decode_internal(*ctx, batch);
12467
- if (ret < 0) {
12468
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12469
- }
12470
-
12471
- return ret;
12472
- }
12473
-
12474
13151
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
12475
13152
  ctx->cparams.n_threads = n_threads;
12476
13153
  ctx->cparams.n_threads_batch = n_threads_batch;
12477
13154
  }
12478
13155
 
13156
+ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
13157
+ ctx->abort_callback = abort_callback;
13158
+ ctx->abort_callback_data = abort_callback_data;
13159
+ }
13160
+
12479
13161
  struct llama_batch llama_batch_get_one(
12480
13162
  llama_token * tokens,
12481
13163
  int32_t n_tokens,
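
The hunk above finally removes llama_eval and llama_eval_embd, leaving the batch API as the only decode path, and adds llama_set_abort_callback so a host application can interrupt a long llama_decode. A hedged migration sketch for a single-sequence caller (the old llama_eval also trimmed the KV cache past n_past, shown here as a comment):

    #include "llama.h"
    #include <atomic>
    #include <vector>

    static std::atomic<bool> g_stop{false};

    // Hedged sketch: what a former llama_eval(ctx, tokens, n_tokens, n_past)
    // call becomes with the batch API. Assumes everything lives in sequence 0.
    static int32_t decode_tokens(llama_context * ctx, std::vector<llama_token> & tokens, llama_pos n_past) {
        // the callback is polled while decoding; returning true aborts the call
        llama_set_abort_callback(ctx, [](void *) { return g_stop.load(); }, nullptr);

        // the removed wrapper also did: llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
        llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), n_past, 0);
        return llama_decode(ctx, batch);
    }
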
@@ -12552,11 +13234,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
12552
13234
  }
12553
13235
 
12554
13236
  float * llama_get_embeddings(struct llama_context * ctx) {
12555
- return ctx->embedding.data();
13237
+ return ctx->embd.data();
12556
13238
  }
12557
13239
 
12558
13240
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12559
- return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
13241
+ return ctx->embd.data() + i*ctx->model.hparams.n_embd;
13242
+ }
13243
+
13244
+ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
13245
+ auto it = ctx->embd_seq.find(seq_id);
13246
+ if (it == ctx->embd_seq.end()) {
13247
+ return nullptr;
13248
+ }
13249
+
13250
+ return it->second.data();
12560
13251
  }
12561
13252
 
12562
13253
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
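
llama_get_embeddings_seq is new in this release: when the context was created with embeddings enabled and a pooling type other than NONE, each decoded sequence gets a single pooled vector addressable by its seq_id (llama_get_embeddings_ith still returns per-token rows). A hedged sketch, assuming a prior llama_decode on sequence 0:

    #include "llama.h"
    #include <cstdio>

    // Hedged sketch: read the pooled embedding of sequence 0 after llama_decode.
    static void print_seq_embedding(llama_context * ctx, const llama_model * model) {
        const float * emb = llama_get_embeddings_seq(ctx, 0);
        if (emb == nullptr) {
            printf("no pooled embedding for seq 0 (pooling disabled?)\n");
            return;
        }
        const int32_t n_embd = llama_n_embd(model);
        printf("first component of %d-dim embedding: %f\n", n_embd, emb[0]);
    }
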
@@ -12730,7 +13421,7 @@ static int32_t llama_chat_apply_template_internal(
12730
13421
  std::string & dest, bool add_ass) {
12731
13422
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
12732
13423
  std::stringstream ss;
12733
- if (tmpl.find("<|im_start|>") != std::string::npos) {
13424
+ if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
12734
13425
  // chatml template
12735
13426
  for (auto message : chat) {
12736
13427
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -12738,7 +13429,7 @@ static int32_t llama_chat_apply_template_internal(
12738
13429
  if (add_ass) {
12739
13430
  ss << "<|im_start|>assistant\n";
12740
13431
  }
12741
- } else if (tmpl.find("[INST]") != std::string::npos) {
13432
+ } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
12742
13433
  // llama2 template and its variants
12743
13434
  // [variant] support system message
12744
13435
  bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -12773,7 +13464,7 @@ static int32_t llama_chat_apply_template_internal(
12773
13464
  }
12774
13465
  }
12775
13466
  // llama2 templates seem to not care about "add_generation_prompt"
12776
- } else if (tmpl.find("<|user|>") != std::string::npos) {
13467
+ } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
12777
13468
  // zephyr template
12778
13469
  for (auto message : chat) {
12779
13470
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -12781,7 +13472,7 @@ static int32_t llama_chat_apply_template_internal(
12781
13472
  if (add_ass) {
12782
13473
  ss << "<|assistant|>\n";
12783
13474
  }
12784
- } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
13475
+ } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
12785
13476
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
12786
13477
  for (auto message : chat) {
12787
13478
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -12790,7 +13481,7 @@ static int32_t llama_chat_apply_template_internal(
12790
13481
  if (add_ass) {
12791
13482
  ss << "<s>assistant\n";
12792
13483
  }
12793
- } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
13484
+ } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
12794
13485
  // google/gemma-7b-it
12795
13486
  std::string system_prompt = "";
12796
13487
  for (auto message : chat) {
@@ -12837,23 +13528,27 @@ LLAMA_API int32_t llama_chat_apply_template(
12837
13528
  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
12838
13529
  if (res < 0) {
12839
13530
  // worst case: there is no information about template, we will use chatml by default
12840
- curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
13531
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
12841
13532
  } else {
12842
13533
  curr_tmpl = std::string(model_template.data(), model_template.size());
12843
13534
  }
12844
13535
  }
13536
+
12845
13537
  // format the chat to string
12846
13538
  std::vector<const llama_chat_message *> chat_vec;
12847
13539
  chat_vec.resize(n_msg);
12848
13540
  for (size_t i = 0; i < n_msg; i++) {
12849
13541
  chat_vec[i] = &chat[i];
12850
13542
  }
13543
+
12851
13544
  std::string formatted_chat;
12852
13545
  int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
12853
13546
  if (res < 0) {
12854
13547
  return res;
12855
13548
  }
12856
- strncpy(buf, formatted_chat.c_str(), length);
13549
+ if (buf && length > 0) {
13550
+ strncpy(buf, formatted_chat.c_str(), length);
13551
+ }
12857
13552
  return res;
12858
13553
  }
12859
13554