llama_cpp 0.12.7 → 0.14.0

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in those public registries.
@@ -68,10 +68,12 @@
68
68
  #include <cstdio>
69
69
  #include <cstring>
70
70
  #include <ctime>
71
+ #include <cwctype>
71
72
  #include <forward_list>
72
73
  #include <fstream>
73
74
  #include <functional>
74
75
  #include <initializer_list>
76
+ #include <locale>
75
77
  #include <map>
76
78
  #include <memory>
77
79
  #include <mutex>
@@ -102,6 +104,7 @@
102
104
  #define LLAMA_MAX_NODES 8192
103
105
  #define LLAMA_MAX_EXPERTS 8
104
106
 
107
+
105
108
  //
106
109
  // logging
107
110
  //
@@ -209,10 +212,11 @@ enum llm_arch {
209
212
  LLM_ARCH_INTERNLM2,
210
213
  LLM_ARCH_MINICPM,
211
214
  LLM_ARCH_GEMMA,
215
+ LLM_ARCH_STARCODER2,
212
216
  LLM_ARCH_UNKNOWN,
213
217
  };
214
218
 
215
- static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
219
+ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
216
220
  { LLM_ARCH_LLAMA, "llama" },
217
221
  { LLM_ARCH_FALCON, "falcon" },
218
222
  { LLM_ARCH_GPT2, "gpt2" },
@@ -236,6 +240,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
236
240
  { LLM_ARCH_INTERNLM2, "internlm2" },
237
241
  { LLM_ARCH_MINICPM, "minicpm" },
238
242
  { LLM_ARCH_GEMMA, "gemma" },
243
+ { LLM_ARCH_STARCODER2, "starcoder2" },
244
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
239
245
  };
240
246
 
241
247
  enum llm_kv {
@@ -296,7 +302,7 @@ enum llm_kv {
296
302
  LLM_KV_TOKENIZER_RWKV,
297
303
  };
298
304
 
299
- static std::map<llm_kv, const char *> LLM_KV_NAMES = {
305
+ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
300
306
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
301
307
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
302
308
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -360,7 +366,7 @@ struct LLM_KV {
360
366
  llm_arch arch;
361
367
 
362
368
  std::string operator()(llm_kv kv) const {
363
- return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
369
+ return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
364
370
  }
365
371
  };
366
372
 
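The name tables above are now const, which is why the lookup in LLM_KV::operator() switches from operator[] to .at(): operator[] default-inserts missing keys and cannot be called on a const std::map, while .at() throws on a missing key (hence the explicit "(unknown)" entry added for LLM_ARCH_UNKNOWN). A minimal standalone sketch of the same pattern, using hypothetical MY_ARCH_* names:

    #include <cstdio>
    #include <map>
    #include <stdexcept>

    // A const lookup table must be read with .at(); operator[] would try to
    // default-insert a missing key and is not callable on a const std::map.
    enum my_arch : int { MY_ARCH_LLAMA, MY_ARCH_FALCON, MY_ARCH_UNKNOWN };

    static const std::map<my_arch, const char *> MY_ARCH_NAMES = {
        { MY_ARCH_LLAMA,   "llama"     },
        { MY_ARCH_FALCON,  "falcon"    },
        { MY_ARCH_UNKNOWN, "(unknown)" },
    };

    int main() {
        // .at() returns the mapped value or throws std::out_of_range
        std::printf("%s\n", MY_ARCH_NAMES.at(MY_ARCH_FALCON));

        try {
            MY_ARCH_NAMES.at(static_cast<my_arch>(42)); // key not in the table
        } catch (const std::out_of_range &) {
            std::printf("missing key throws instead of silently inserting\n");
        }
        return 0;
    }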
@@ -395,7 +401,7 @@ enum llm_tensor {
395
401
  LLM_TENSOR_LAYER_OUT_NORM,
396
402
  };
397
403
 
398
- static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
404
+ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
399
405
  {
400
406
  LLM_ARCH_LLAMA,
401
407
  {
@@ -777,6 +783,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
777
783
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
778
784
  },
779
785
  },
786
+ {
787
+ LLM_ARCH_STARCODER2,
788
+ {
789
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
790
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
791
+ { LLM_TENSOR_OUTPUT, "output" },
792
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
793
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
794
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
795
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
796
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
797
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
798
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
799
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
800
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
801
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
802
+ },
803
+ },
780
804
  {
781
805
  LLM_ARCH_UNKNOWN,
782
806
  {
@@ -810,38 +834,38 @@ struct LLM_TN {
810
834
  llm_arch arch;
811
835
 
812
836
  std::string operator()(llm_tensor tensor) const {
813
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
837
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
814
838
  return "__missing__";
815
839
  }
816
- return LLM_TENSOR_NAMES[arch].at(tensor);
840
+ return LLM_TENSOR_NAMES.at(arch).at(tensor);
817
841
  }
818
842
 
819
843
  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
820
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
844
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
821
845
  return "__missing__";
822
846
  }
823
- return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
847
+ return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
824
848
  }
825
849
 
826
850
  std::string operator()(llm_tensor tensor, int bid) const {
827
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
851
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
828
852
  return "__missing__";
829
853
  }
830
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
854
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
831
855
  }
832
856
 
833
857
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
834
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
858
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
835
859
  return "__missing__";
836
860
  }
837
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
861
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
838
862
  }
839
863
 
840
864
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
841
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
865
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
842
866
  return "__missing__";
843
867
  }
844
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
868
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
845
869
  }
846
870
  };
847
871
 
@@ -849,20 +873,20 @@ struct LLM_TN {
849
873
  // gguf helpers
850
874
  //
851
875
 
852
- static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
853
- { LLAMA_ROPE_SCALING_NONE, "none" },
854
- { LLAMA_ROPE_SCALING_LINEAR, "linear" },
855
- { LLAMA_ROPE_SCALING_YARN, "yarn" },
876
+ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
877
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
878
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
879
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
856
880
  };
857
881
 
858
- static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
882
+ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
859
883
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
860
884
  if (kv.second == name) {
861
- return kv.first;
885
+ return (llama_rope_scaling_type) kv.first;
862
886
  }
863
887
  }
864
888
 
865
- return LLAMA_ROPE_SCALING_UNSPECIFIED;
889
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
866
890
  }
867
891
 
868
892
  static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
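The hunk above also tightens llama_rope_scaling_type_from_string to return the enum type directly rather than a bare int32_t, still falling back to the *_UNSPECIFIED value when the string is unknown. A standalone sketch of the same reverse lookup, with hypothetical ROPE_SCALING_* names:

    #include <cstdio>
    #include <map>
    #include <string>

    enum rope_scaling : int {
        ROPE_SCALING_UNSPECIFIED = -1,
        ROPE_SCALING_NONE,
        ROPE_SCALING_LINEAR,
        ROPE_SCALING_YARN,
    };

    static const std::map<rope_scaling, const char *> ROPE_SCALING_NAMES = {
        { ROPE_SCALING_NONE,   "none"   },
        { ROPE_SCALING_LINEAR, "linear" },
        { ROPE_SCALING_YARN,   "yarn"   },
    };

    // walk the name table; unknown strings map to the "unspecified" value
    static rope_scaling rope_scaling_from_string(const std::string & name) {
        for (const auto & kv : ROPE_SCALING_NAMES) {
            if (kv.second == name) {
                return kv.first;
            }
        }
        return ROPE_SCALING_UNSPECIFIED;
    }

    int main() {
        std::printf("%d\n", rope_scaling_from_string("yarn"));      // 2
        std::printf("%d\n", rope_scaling_from_string("quadratic")); // -1
        return 0;
    }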
@@ -1407,7 +1431,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
1407
1431
  buft = ggml_backend_cuda_host_buffer_type();
1408
1432
  }
1409
1433
  #elif defined(GGML_USE_SYCL)
1410
- buft = ggml_backend_sycl_host_buffer_type();
1434
+ if (host_buffer) {
1435
+ buft = ggml_backend_sycl_host_buffer_type();
1436
+ }
1411
1437
  #elif defined(GGML_USE_CPU_HBM)
1412
1438
  buft = ggml_backend_cpu_hbm_buffer_type();
1413
1439
  #elif defined(GGML_USE_VULKAN)
@@ -1461,6 +1487,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1461
1487
  }
1462
1488
  #endif
1463
1489
 
1490
+ #ifdef GGML_USE_SYCL
1491
+ if (ggml_backend_sycl_get_device_count() > 1) {
1492
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
1493
+ }
1494
+ #endif
1495
+
1464
1496
  if (buft == nullptr) {
1465
1497
  buft = llama_default_buffer_type_offload(fallback_gpu);
1466
1498
  }
@@ -1472,6 +1504,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1472
1504
  static size_t llama_get_device_count() {
1473
1505
  #if defined(GGML_USE_CUBLAS)
1474
1506
  return ggml_backend_cuda_get_device_count();
1507
+ #elif defined(GGML_USE_SYCL)
1508
+ return ggml_backend_sycl_get_device_count();
1475
1509
  #elif defined(GGML_USE_VULKAN)
1476
1510
  return ggml_backend_vk_get_device_count();
1477
1511
  #else
@@ -1485,6 +1519,11 @@ static size_t llama_get_device_memory(int device) {
1485
1519
  size_t free;
1486
1520
  ggml_backend_cuda_get_device_memory(device, &total, &free);
1487
1521
  return free;
1522
+ #elif defined(GGML_USE_SYCL)
1523
+ size_t total;
1524
+ size_t free;
1525
+ ggml_backend_sycl_get_device_memory(device, &total, &free);
1526
+ return free;
1488
1527
  #elif defined(GGML_USE_VULKAN)
1489
1528
  size_t total;
1490
1529
  size_t free;
@@ -1550,8 +1589,9 @@ static const size_t MiB = 1024*kiB;
1550
1589
  static const size_t GiB = 1024*MiB;
1551
1590
 
1552
1591
  struct llama_hparams {
1553
- bool vocab_only;
1554
- bool rope_finetuned;
1592
+ bool vocab_only;
1593
+ bool rope_finetuned;
1594
+
1555
1595
  uint32_t n_vocab;
1556
1596
  uint32_t n_ctx_train; // context size the model was trained on
1557
1597
  uint32_t n_embd;
@@ -1572,7 +1612,6 @@ struct llama_hparams {
1572
1612
  float rope_freq_base_train;
1573
1613
  float rope_freq_scale_train;
1574
1614
  uint32_t n_yarn_orig_ctx;
1575
- int32_t rope_scaling_type_train;
1576
1615
 
1577
1616
  float f_clamp_kqv = 0.0f;
1578
1617
  float f_max_alibi_bias = 0.0f;
@@ -1580,7 +1619,9 @@ struct llama_hparams {
1580
1619
  bool causal_attn = true;
1581
1620
  bool need_kq_pos = false;
1582
1621
 
1583
- uint32_t pooling_type = LLAMA_POOLING_NONE;
1622
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1623
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1624
+ enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
1584
1625
 
1585
1626
  bool operator!=(const llama_hparams & other) const {
1586
1627
  if (this->vocab_only != other.vocab_only) return true;
@@ -1624,13 +1665,13 @@ struct llama_hparams {
1624
1665
  };
1625
1666
 
1626
1667
  struct llama_cparams {
1627
- uint32_t n_ctx; // context size used during inference
1668
+ uint32_t n_ctx; // context size used during inference
1628
1669
  uint32_t n_batch;
1629
1670
  uint32_t n_threads; // number of threads to use for generation
1630
1671
  uint32_t n_threads_batch; // number of threads to use for batch processing
1631
1672
 
1632
- float rope_freq_base;
1633
- float rope_freq_scale;
1673
+ float rope_freq_base;
1674
+ float rope_freq_scale;
1634
1675
 
1635
1676
  uint32_t n_yarn_orig_ctx;
1636
1677
  // These hyperparameters are not exposed in GGUF, because all
@@ -1639,10 +1680,12 @@ struct llama_cparams {
1639
1680
  float yarn_attn_factor;
1640
1681
  float yarn_beta_fast;
1641
1682
  float yarn_beta_slow;
1683
+ float defrag_thold;
1642
1684
 
1643
- bool mul_mat_q;
1685
+ bool embeddings;
1644
1686
  bool offload_kqv;
1645
- bool do_pooling;
1687
+
1688
+ enum llama_pooling_type pooling_type;
1646
1689
 
1647
1690
  ggml_backend_sched_eval_callback cb_eval;
1648
1691
  void * cb_eval_user_data;
@@ -1707,11 +1750,20 @@ struct llama_kv_cell {
1707
1750
  bool has_seq_id(const llama_seq_id & id) const {
1708
1751
  return seq_id.find(id) != seq_id.end();
1709
1752
  }
1753
+
1754
+ bool is_empty() const {
1755
+ return seq_id.empty();
1756
+ }
1757
+
1758
+ bool is_same_seq(const llama_kv_cell & other) const {
1759
+ return seq_id == other.seq_id;
1760
+ }
1710
1761
  };
1711
1762
 
1712
1763
  // ring-buffer of cached KV data
1713
1764
  struct llama_kv_cache {
1714
1765
  bool has_shift = false;
1766
+ bool do_defrag = false;
1715
1767
 
1716
1768
  // Note: The value of head isn't only used to optimize searching
1717
1769
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1723,6 +1775,9 @@ struct llama_kv_cache {
1723
1775
  // computed before each graph build
1724
1776
  uint32_t n = 0;
1725
1777
 
1778
+ ggml_type type_k = GGML_TYPE_F16;
1779
+ ggml_type type_v = GGML_TYPE_F16;
1780
+
1726
1781
  std::vector<llama_kv_cell> cells;
1727
1782
 
1728
1783
  std::vector<struct ggml_tensor *> k_l; // per layer
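Together with the is_empty()/is_same_seq() helpers added to llama_kv_cell a couple of hunks above, the cache code now asks cells directly whether any sequence still references them and whether two cells serve exactly the same set of sequences. A simplified stand-in, with field types reduced to plain int:

    #include <cstdio>
    #include <set>

    // Rough sketch of the cell helpers shown above.
    struct kv_cell {
        int pos = -1;
        std::set<int> seq_id;

        bool has_seq_id(int id) const { return seq_id.count(id) > 0; }
        bool is_empty()         const { return seq_id.empty(); }
        bool is_same_seq(const kv_cell & other) const { return seq_id == other.seq_id; }
    };

    int main() {
        kv_cell a; a.pos = 7; a.seq_id = {0, 2};
        kv_cell b; b.pos = 8; b.seq_id = {0, 2};
        kv_cell c; // never assigned to a sequence

        std::printf("a empty: %d, c empty: %d\n", a.is_empty(), c.is_empty()); // 0, 1
        std::printf("a/b same sequence set: %d\n", a.is_same_seq(b));          // 1
        return 0;
    }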
@@ -1919,7 +1974,7 @@ struct llama_context {
1919
1974
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1920
1975
  int32_t n_eval = 0; // number of eval calls
1921
1976
 
1922
- // decode output (2-dimensional array: [n_tokens][n_vocab])
1977
+ // logits output (2-dimensional array: [n_tokens][n_vocab])
1923
1978
  std::vector<float> logits;
1924
1979
  #ifndef NDEBUG
1925
1980
  // guard against access to unset logits
@@ -1927,13 +1982,21 @@ struct llama_context {
1927
1982
  #endif
1928
1983
  bool logits_all = false;
1929
1984
 
1930
- // input embedding (1-dimensional array: [n_embd])
1931
- std::vector<float> embedding;
1985
+ // embeddings output (2-dimensional array: [n_tokens][n_embd])
1986
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
1987
+ std::vector<float> embd;
1988
+
1989
+ // sequence embeddings output (map of [n_embd] vectors)
1990
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
1991
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
1932
1992
 
1933
1993
  // memory buffers used to evaluate the model
1934
1994
  std::vector<uint8_t> buf_compute_meta;
1935
1995
  ggml_backend_sched_t sched = nullptr;
1936
1996
 
1997
+ ggml_abort_callback abort_callback = nullptr;
1998
+ void * abort_callback_data = nullptr;
1999
+
1937
2000
  // input tensors
1938
2001
  ggml_backend_buffer_t buf_input = nullptr;
1939
2002
  ggml_context * ctx_input = nullptr;
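The context now keeps two embedding outputs: the flat per-token buffer (embd) used when pooling_type is NONE, and a map of pooled vectors keyed by sequence id (embd_seq) otherwise. A toy sketch of the two layouts, with made-up sizes:

    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
        const int n_tokens = 4;
        const int n_embd   = 3;

        // pooling disabled: one row per token, addressed as embd[i*n_embd + j]
        std::vector<float> embd(n_tokens * n_embd, 0.0f);

        // pooling enabled: one pooled vector per sequence id
        std::map<int, std::vector<float>> embd_seq;
        embd_seq[0] = std::vector<float>(n_embd, 0.5f);  // sequence 0
        embd_seq[3] = std::vector<float>(n_embd, -1.0f); // sequence 3 (ids need not be dense)

        std::printf("per-token floats: %zu, sequences pooled: %zu\n",
                    embd.size(), embd_seq.size());
        return 0;
    }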
@@ -1958,8 +2021,8 @@ struct llama_context {
1958
2021
  static bool llama_kv_cache_init(
1959
2022
  struct llama_kv_cache & cache,
1960
2023
  const llama_model & model,
1961
- ggml_type ktype,
1962
- ggml_type vtype,
2024
+ ggml_type type_k,
2025
+ ggml_type type_v,
1963
2026
  uint32_t n_ctx,
1964
2027
  bool offload) {
1965
2028
  const struct llama_hparams & hparams = model.hparams;
@@ -1974,6 +2037,9 @@ static bool llama_kv_cache_init(
1974
2037
  cache.size = n_ctx;
1975
2038
  cache.used = 0;
1976
2039
 
2040
+ cache.type_k = type_k;
2041
+ cache.type_v = type_v;
2042
+
1977
2043
  cache.cells.clear();
1978
2044
  cache.cells.resize(n_ctx);
1979
2045
 
@@ -2014,8 +2080,8 @@ static bool llama_kv_cache_init(
2014
2080
 
2015
2081
  for (int i = 0; i < (int) n_layer; i++) {
2016
2082
  struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
2017
- ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
2018
- ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
2083
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
2084
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
2019
2085
  ggml_format_name(k, "cache_k_l%d", i);
2020
2086
  ggml_format_name(v, "cache_v_l%d", i);
2021
2087
  cache.k_l.push_back(k);
@@ -2097,10 +2163,12 @@ static bool llama_kv_cache_find_slot(
2097
2163
  }
2098
2164
 
2099
2165
  // find how many cells are currently in use
2100
- static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2101
- for (uint32_t i = cache.size - 1; i > 0; --i) {
2102
- if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
2103
- return i + 1;
2166
+ static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2167
+ for (uint32_t i = cache.size; i > 0; --i) {
2168
+ const llama_kv_cell & cell = cache.cells[i - 1];
2169
+
2170
+ if (cell.pos >= 0 && !cell.is_empty()) {
2171
+ return i;
2104
2172
  }
2105
2173
  }
2106
2174
 
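The reworked llama_kv_cache_cell_max above iterates i = size .. 1 and indexes cells[i - 1], so cell 0 is now inspected as well (the old loop stopped at i > 0 and never looked at it), and it returns one past the index of the last occupied cell. A standalone sketch under the same convention:

    #include <cstdio>
    #include <vector>

    struct cell { int pos = -1; bool used = false; };

    // Scan from the back; an empty cache yields 0, a fully occupied one size().
    static unsigned cell_max(const std::vector<cell> & cells) {
        for (unsigned i = cells.size(); i > 0; --i) {
            const cell & c = cells[i - 1];
            if (c.pos >= 0 && c.used) {
                return i;
            }
        }
        return 0;
    }

    int main() {
        std::vector<cell> cells(8);
        std::printf("empty cache: %u\n", cell_max(cells));            // 0

        cells[0].pos = 5; cells[0].used = true;
        cells[3].pos = 9; cells[3].used = true;
        std::printf("last used cell at index 3: %u\n", cell_max(cells)); // 4
        return 0;
    }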
@@ -2135,7 +2203,7 @@ static void llama_kv_cache_seq_rm(
2135
2203
  } else {
2136
2204
  continue;
2137
2205
  }
2138
- if (cache.cells[i].seq_id.empty()) {
2206
+ if (cache.cells[i].is_empty()) {
2139
2207
  // keep count of the number of used cells
2140
2208
  if (cache.cells[i].pos >= 0) cache.used--;
2141
2209
 
@@ -2186,7 +2254,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
2186
2254
  if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
2187
2255
  }
2188
2256
 
2189
- static void llama_kv_cache_seq_shift(
2257
+ static void llama_kv_cache_seq_add(
2190
2258
  struct llama_kv_cache & cache,
2191
2259
  llama_seq_id seq_id,
2192
2260
  llama_pos p0,
@@ -2204,10 +2272,14 @@ static void llama_kv_cache_seq_shift(
2204
2272
  cache.cells[i].delta += delta;
2205
2273
 
2206
2274
  if (cache.cells[i].pos < 0) {
2207
- if (!cache.cells[i].seq_id.empty()) cache.used--;
2275
+ if (!cache.cells[i].is_empty()) {
2276
+ cache.used--;
2277
+ }
2208
2278
  cache.cells[i].pos = -1;
2209
2279
  cache.cells[i].seq_id.clear();
2210
- if (new_head == cache.size) new_head = i;
2280
+ if (new_head == cache.size) {
2281
+ new_head = i;
2282
+ }
2211
2283
  }
2212
2284
  }
2213
2285
  }
@@ -2239,6 +2311,22 @@ static void llama_kv_cache_seq_div(
2239
2311
  }
2240
2312
  }
2241
2313
 
2314
+ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
2315
+ llama_pos result = 0;
2316
+
2317
+ for (uint32_t i = 0; i < cache.size; ++i) {
2318
+ if (cache.cells[i].has_seq_id(seq_id)) {
2319
+ result = std::max(result, cache.cells[i].pos);
2320
+ }
2321
+ }
2322
+
2323
+ return result;
2324
+ }
2325
+
2326
+ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
2327
+ cache.do_defrag = true;
2328
+ }
2329
+
2242
2330
  //
2243
2331
  // model loading and saving
2244
2332
  //
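llama_kv_cache_seq_pos_max, added above, scans the whole cache and reports the largest position stored for a given sequence (0 when the sequence has no cells), while llama_kv_cache_defrag only raises the do_defrag flag for later processing. A minimal sketch of the position scan:

    #include <algorithm>
    #include <cstdio>
    #include <set>
    #include <vector>

    struct cell { int pos = -1; std::set<int> seq_id; };

    static int seq_pos_max(const std::vector<cell> & cells, int seq_id) {
        int result = 0;
        for (const cell & c : cells) {
            if (c.seq_id.count(seq_id)) {
                result = std::max(result, c.pos);
            }
        }
        return result;
    }

    int main() {
        std::vector<cell> cells(4);
        cells[0].pos = 10; cells[0].seq_id = {1};
        cells[2].pos = 25; cells[2].seq_id = {1, 2};

        std::printf("seq 1 max pos: %d\n", seq_pos_max(cells, 1)); // 25
        std::printf("seq 7 max pos: %d\n", seq_pos_max(cells, 7)); // 0
        return 0;
    }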
@@ -2310,7 +2398,7 @@ namespace GGUFMeta {
2310
2398
  }
2311
2399
  };
2312
2400
 
2313
- struct ArrayInfo{
2401
+ struct ArrayInfo {
2314
2402
  const gguf_type gt;
2315
2403
  const size_t length;
2316
2404
  const void * data;
@@ -2329,7 +2417,7 @@ namespace GGUFMeta {
2329
2417
  };
2330
2418
 
2331
2419
  template<typename T>
2332
- class GKV: public GKV_Base<T> {
2420
+ class GKV : public GKV_Base<T> {
2333
2421
  GKV() = delete;
2334
2422
 
2335
2423
  public:
@@ -2345,46 +2433,46 @@ namespace GGUFMeta {
2345
2433
 
2346
2434
  static const char * override_type_to_str(const llama_model_kv_override_type ty) {
2347
2435
  switch (ty) {
2348
- case LLAMA_KV_OVERRIDE_BOOL: return "bool";
2349
- case LLAMA_KV_OVERRIDE_INT: return "int";
2350
- case LLAMA_KV_OVERRIDE_FLOAT: return "float";
2436
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2437
+ case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2438
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2351
2439
  }
2352
2440
  return "unknown";
2353
2441
  }
2354
2442
 
2355
- static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
2356
- if (!override) { return false; }
2357
- if (override->tag == expected_type) {
2443
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
2444
+ if (!ovrd) { return false; }
2445
+ if (ovrd->tag == expected_type) {
2358
2446
  LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
2359
- __func__, override_type_to_str(override->tag), override->key);
2360
- switch (override->tag) {
2361
- case LLAMA_KV_OVERRIDE_BOOL: {
2362
- LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
2447
+ __func__, override_type_to_str(ovrd->tag), ovrd->key);
2448
+ switch (ovrd->tag) {
2449
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2450
+ LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2363
2451
  } break;
2364
- case LLAMA_KV_OVERRIDE_INT: {
2365
- LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
2452
+ case LLAMA_KV_OVERRIDE_TYPE_INT: {
2453
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2366
2454
  } break;
2367
- case LLAMA_KV_OVERRIDE_FLOAT: {
2368
- LLAMA_LOG_INFO("%.6f\n", override->float_value);
2455
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2456
+ LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2369
2457
  } break;
2370
2458
  default:
2371
2459
  // Shouldn't be possible to end up here, but just in case...
2372
2460
  throw std::runtime_error(
2373
2461
  format("Unsupported attempt to override %s type for metadata key %s\n",
2374
- override_type_to_str(override->tag), override->key));
2462
+ override_type_to_str(ovrd->tag), ovrd->key));
2375
2463
  }
2376
2464
  return true;
2377
2465
  }
2378
2466
  LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
2379
- __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
2467
+ __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
2380
2468
  return false;
2381
2469
  }
2382
2470
 
2383
2471
  template<typename OT>
2384
2472
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2385
- try_override(OT & target, const struct llama_model_kv_override *override) {
2386
- if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
2387
- target = override->bool_value;
2473
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2474
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2475
+ target = ovrd->bool_value;
2388
2476
  return true;
2389
2477
  }
2390
2478
  return false;
@@ -2392,9 +2480,9 @@ namespace GGUFMeta {
2392
2480
 
2393
2481
  template<typename OT>
2394
2482
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2395
- try_override(OT & target, const struct llama_model_kv_override *override) {
2396
- if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
2397
- target = override->int_value;
2483
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2484
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2485
+ target = ovrd->int_value;
2398
2486
  return true;
2399
2487
  }
2400
2488
  return false;
@@ -2402,9 +2490,9 @@ namespace GGUFMeta {
2402
2490
 
2403
2491
  template<typename OT>
2404
2492
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2405
- try_override(T & target, const struct llama_model_kv_override *override) {
2406
- if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
2407
- target = override->float_value;
2493
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2494
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2495
+ target = ovrd->float_value;
2408
2496
  return true;
2409
2497
  }
2410
2498
  return false;
@@ -2412,17 +2500,17 @@ namespace GGUFMeta {
2412
2500
 
2413
2501
  template<typename OT>
2414
2502
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2415
- try_override(T & target, const struct llama_model_kv_override *override) {
2503
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2416
2504
  (void)target;
2417
- (void)override;
2418
- if (!override) { return false; }
2505
+ (void)ovrd;
2506
+ if (!ovrd) { return false; }
2419
2507
  // Currently, we should never end up here so it would be a bug if we do.
2420
2508
  throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2421
- override ? override->key : "NULL"));
2509
+ ovrd ? ovrd->key : "NULL"));
2422
2510
  }
2423
2511
 
2424
- static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
2425
- if (try_override<T>(target, override)) {
2512
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2513
+ if (try_override<T>(target, ovrd)) {
2426
2514
  return true;
2427
2515
  }
2428
2516
  if (k < 0) { return false; }
@@ -2430,12 +2518,12 @@ namespace GGUFMeta {
2430
2518
  return true;
2431
2519
  }
2432
2520
 
2433
- static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
2434
- return set(ctx, gguf_find_key(ctx, key), target, override);
2521
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2522
+ return set(ctx, gguf_find_key(ctx, key), target, ovrd);
2435
2523
  }
2436
2524
 
2437
- static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
2438
- return set(ctx, key.c_str(), target, override);
2525
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2526
+ return set(ctx, key.c_str(), target, ovrd);
2439
2527
  }
2440
2528
  };
2441
2529
  }
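The GGUFMeta changes above are mostly the parameter rename from `override` to `ovrd` (presumably to avoid confusion with the `override` specifier) plus the LLAMA_KV_OVERRIDE_TYPE_* enum renames; the underlying pattern is unchanged: consult a typed override record first and only fall back to the value stored in the file. A simplified sketch of that pattern with made-up type and field names (the real code selects the matching overload via enable_if on the target type):

    #include <cstdio>
    #include <cstdint>

    enum ovrd_type { OVRD_BOOL, OVRD_INT, OVRD_FLOAT };

    struct kv_override {
        ovrd_type tag;
        union {
            bool    bool_value;
            int64_t int_value;
            double  float_value;
        };
    };

    // Only an override of the expected type is applied; otherwise the caller
    // keeps whatever it read from the file.
    static bool try_override(int64_t & target, const kv_override * ovrd) {
        if (ovrd && ovrd->tag == OVRD_INT) {
            target = ovrd->int_value;
            return true;
        }
        return false;
    }

    int main() {
        kv_override o;
        o.tag       = OVRD_INT;
        o.int_value = 8192;

        int64_t n_ctx_train = 4096;           // pretend this was read from the file
        try_override(n_ctx_train, &o);        // override wins when present
        std::printf("n_ctx_train = %lld\n", (long long) n_ctx_train); // 8192
        return 0;
    }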
@@ -2542,9 +2630,12 @@ struct llama_model_loader {
2542
2630
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2543
2631
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2544
2632
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2633
+ case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
2545
2634
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2546
2635
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
2547
2636
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2637
+ case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
2638
+ case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
2548
2639
  default:
2549
2640
  {
2550
2641
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2845,6 +2936,19 @@ struct llama_model_loader {
2845
2936
  }
2846
2937
  };
2847
2938
 
2939
+ template<>
2940
+ bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
2941
+ uint32_t tmp;
2942
+ const bool found = get_key(kid, tmp, required);
2943
+ if (found) {
2944
+ result = (enum llama_pooling_type) tmp;
2945
+ } else {
2946
+ result = LLAMA_POOLING_TYPE_UNSPECIFIED;
2947
+ }
2948
+ return found;
2949
+ }
2950
+
2951
+
2848
2952
  //
2849
2953
  // load LLaMA models
2850
2954
  //
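The new get_key specialization above reads LLM_KV_POOLING_TYPE as a plain integer and casts it to llama_pooling_type, defaulting to LLAMA_POOLING_TYPE_UNSPECIFIED when the key is absent (which is why the BERT path further down can pass required = false). A rough standalone sketch; the reader function and key name are stand-ins:

    #include <cstdio>

    enum pooling_type : int {
        POOLING_UNSPECIFIED = -1,
        POOLING_NONE        = 0,
        POOLING_MEAN        = 1,
        POOLING_CLS         = 2,
    };

    // stand-in for the metadata reader; returns false when the key is absent
    static bool read_u32(const char * key, unsigned & out) {
        (void) key;
        out = 1; // pretend the file says "mean"
        return true;
    }

    static bool get_pooling(const char * key, pooling_type & result) {
        unsigned tmp = 0;
        const bool found = read_u32(key, tmp);
        result = found ? (pooling_type) tmp : POOLING_UNSPECIFIED;
        return found;
    }

    int main() {
        pooling_type p;
        get_pooling("mymodel.pooling_type", p);
        std::printf("pooling = %d\n", (int) p); // 1 (mean)
        return 0;
    }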
@@ -2886,10 +2990,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2886
2990
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2887
2991
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
2888
2992
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2889
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2993
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
2994
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
2995
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
2890
2996
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2891
2997
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
2892
2998
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
2999
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
3000
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
3001
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
2893
3002
 
2894
3003
  default: return "unknown, may not work";
2895
3004
  }
@@ -2923,16 +3032,16 @@ static const char * llama_model_type_name(e_model type) {
2923
3032
  default: return "?B";
2924
3033
  }
2925
3034
  }
3035
+
2926
3036
  static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2927
3037
  switch (type) {
2928
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2929
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2930
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2931
- default: return "unknown";
3038
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
3039
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
3040
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
3041
+ default: return "unknown";
2932
3042
  }
2933
3043
  }
2934
3044
 
2935
-
2936
3045
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2937
3046
  model.arch = ml.get_arch();
2938
3047
  if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2996,7 +3105,7 @@ static void llm_load_hparams(
2996
3105
  std::string rope_scaling("linear");
2997
3106
  ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2998
3107
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2999
- GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
3108
+ GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
3000
3109
 
3001
3110
  // rope_freq_scale (inverse of the kv) is optional
3002
3111
  float ropescale = 0.0f;
@@ -3109,10 +3218,10 @@ static void llm_load_hparams(
3109
3218
  } break;
3110
3219
  case LLM_ARCH_BERT:
3111
3220
  {
3112
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3113
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3221
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3222
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3114
3223
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3115
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3224
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
3116
3225
 
3117
3226
  switch (hparams.n_layer) {
3118
3227
  case 3:
@@ -3130,10 +3239,10 @@ static void llm_load_hparams(
3130
3239
  } break;
3131
3240
  case LLM_ARCH_NOMIC_BERT:
3132
3241
  {
3133
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3134
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3242
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3243
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3135
3244
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3136
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3245
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3137
3246
 
3138
3247
  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
3139
3248
  model.type = e_model::MODEL_137M;
@@ -3264,6 +3373,16 @@ static void llm_load_hparams(
3264
3373
  default: model.type = e_model::MODEL_UNKNOWN;
3265
3374
  }
3266
3375
  } break;
3376
+ case LLM_ARCH_STARCODER2:
3377
+ {
3378
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3379
+ switch (hparams.n_layer) {
3380
+ case 30: model.type = e_model::MODEL_3B; break;
3381
+ case 32: model.type = e_model::MODEL_7B; break;
3382
+ case 40: model.type = e_model::MODEL_15B; break;
3383
+ default: model.type = e_model::MODEL_UNKNOWN;
3384
+ }
3385
+ } break;
3267
3386
  default: (void)0;
3268
3387
  }
3269
3388
 
@@ -3272,6 +3391,8 @@ static void llm_load_hparams(
3272
3391
  if (hparams.f_max_alibi_bias > 0.0f) {
3273
3392
  hparams.need_kq_pos = true;
3274
3393
  }
3394
+
3395
+ hparams.rope_type = llama_rope_type(&model);
3275
3396
  }
3276
3397
 
3277
3398
  // TODO: This should probably be in llama.h
@@ -3574,6 +3695,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3574
3695
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3575
3696
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3576
3697
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3698
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3699
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3577
3700
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3578
3701
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3579
3702
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3640,7 +3763,7 @@ static bool llm_load_tensors(
3640
3763
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3641
3764
  }
3642
3765
 
3643
- if (split_mode == LLAMA_SPLIT_LAYER) {
3766
+ if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
3644
3767
  // calculate the split points
3645
3768
  int device_count = llama_get_device_count();
3646
3769
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3679,10 +3802,10 @@ static bool llm_load_tensors(
3679
3802
  }
3680
3803
  } else {
3681
3804
  ggml_backend_buffer_type_t split_buft;
3682
- if (split_mode == LLAMA_SPLIT_ROW) {
3805
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
3683
3806
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
3684
3807
  } else {
3685
- // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
3808
+ // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
3686
3809
  split_buft = llama_default_buffer_type_offload(main_gpu);
3687
3810
  }
3688
3811
  // assign the repeating layers
@@ -4430,6 +4553,56 @@ static bool llm_load_tensors(
4430
4553
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4431
4554
  }
4432
4555
  } break;
4556
+ case LLM_ARCH_STARCODER2:
4557
+ {
4558
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4559
+
4560
+ // output
4561
+ {
4562
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4563
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
4564
+
4565
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4566
+ // if output is NULL, init from the input tok embed
4567
+ if (model.output == NULL) {
4568
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4569
+ ml.n_created--; // artificial tensor
4570
+ ml.size_data += ggml_nbytes(model.output);
4571
+ }
4572
+
4573
+ }
4574
+
4575
+ for (int i = 0; i < n_layer; ++i) {
4576
+ ggml_context * ctx_layer = ctx_for_layer(i);
4577
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4578
+
4579
+ auto & layer = model.layers[i];
4580
+
4581
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4582
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4583
+
4584
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4585
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4586
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4587
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4588
+
4589
+ // optional bias tensors
4590
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
4591
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
4592
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
4593
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4594
+
4595
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4596
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
4597
+
4598
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4599
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4600
+
4601
+ // optional bias tensors
4602
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4603
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
4604
+ }
4605
+ } break;
4433
4606
  default:
4434
4607
  throw std::runtime_error("unknown architecture");
4435
4608
  }
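As in several other architectures, the StarCoder2 loader above treats the output matrix as optional and falls back to the token-embedding matrix when it is missing (tied embeddings). Numerically that just means each final logit is the dot product of the hidden state with one embedding row, as in this toy example with made-up sizes:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_vocab = 3;
        const int n_embd  = 4;

        // token embedding matrix, row-major [n_vocab][n_embd]
        const std::vector<float> tok_embd = {
            0.1f, 0.0f, 0.2f, 0.0f,   // token 0
            0.0f, 0.5f, 0.0f, 0.0f,   // token 1
            0.3f, 0.3f, 0.3f, 0.3f,   // token 2
        };

        const std::vector<float> hidden = { 1.0f, 2.0f, 3.0f, 4.0f };

        // with tied weights, logit[v] = dot(hidden, tok_embd row v)
        for (int v = 0; v < n_vocab; ++v) {
            float logit = 0.0f;
            for (int j = 0; j < n_embd; ++j) {
                logit += hidden[j] * tok_embd[v*n_embd + j];
            }
            std::printf("logit[%d] = %.2f\n", v, logit);
        }
        return 0;
    }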
@@ -4595,12 +4768,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4595
4768
 
4596
4769
  using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
4597
4770
 
4598
- enum llm_rope_type {
4599
- LLM_ROPE,
4600
- LLM_ROPE_NEOX,
4601
- LLM_ROPE_GLM,
4602
- };
4603
-
4604
4771
  enum llm_ffn_op_type {
4605
4772
  LLM_FFN_SILU,
4606
4773
  LLM_FFN_GELU,
@@ -4646,55 +4813,6 @@ static struct ggml_tensor * llm_build_inp_embd(
4646
4813
  return inpL;
4647
4814
  }
4648
4815
 
4649
- // Persimmon: n_rot = n_embd_head_k/2
4650
- // Other: n_rot = n_embd_head_k
4651
- static void llm_build_k_shift(
4652
- struct ggml_context * ctx,
4653
- const llama_hparams & hparams,
4654
- const llama_cparams & cparams,
4655
- const llama_kv_cache & kv,
4656
- struct ggml_cgraph * graph,
4657
- struct ggml_tensor * K_shift,
4658
- llm_rope_type type,
4659
- int64_t n_ctx,
4660
- float freq_base,
4661
- float freq_scale,
4662
- const llm_build_cb & cb) {
4663
- const int64_t n_layer = hparams.n_layer;
4664
- const int64_t n_head_kv = hparams.n_head_kv;
4665
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
4666
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4667
- const int32_t n_rot = hparams.n_rot;
4668
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
4669
- const float ext_factor = cparams.yarn_ext_factor;
4670
- const float attn_factor = cparams.yarn_attn_factor;
4671
- const float beta_fast = cparams.yarn_beta_fast;
4672
- const float beta_slow = cparams.yarn_beta_slow;
4673
-
4674
- int rope_type = 0;
4675
-
4676
- switch (type) {
4677
- case LLM_ROPE: rope_type = 0; break;
4678
- case LLM_ROPE_NEOX: rope_type = 2; break;
4679
- case LLM_ROPE_GLM: rope_type = 4; break;
4680
- }
4681
-
4682
- for (int il = 0; il < n_layer; ++il) {
4683
- struct ggml_tensor * tmp =
4684
- // we rotate only the first n_rot dimensions
4685
- ggml_rope_custom_inplace(ctx,
4686
- ggml_view_3d(ctx, kv.k_l[il],
4687
- n_embd_head_k, n_head_kv, n_ctx,
4688
- ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
4689
- ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
4690
- 0),
4691
- K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
4692
- ext_factor, attn_factor, beta_fast, beta_slow);
4693
- cb(tmp, "K_shifted", il);
4694
- ggml_build_forward_expand(graph, tmp);
4695
- }
4696
- }
4697
-
4698
4816
  static void llm_build_kv_store(
4699
4817
  struct ggml_context * ctx,
4700
4818
  const llama_hparams & hparams,
@@ -4896,8 +5014,8 @@ static struct ggml_tensor * llm_build_kqv(
4896
5014
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4897
5015
  }
4898
5016
 
4899
- #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
4900
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
5017
+ #if defined(GGML_USE_KOMPUTE)
5018
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
4901
5019
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4902
5020
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4903
5021
  if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4981,6 +5099,7 @@ static struct ggml_tensor * llm_build_kv(
4981
5099
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4982
5100
 
4983
5101
  struct ggml_tensor * cur;
5102
+
4984
5103
  cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4985
5104
  q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
4986
5105
  cb(cur, "kqv_out", il);
@@ -4998,6 +5117,7 @@ struct llm_build_context {
4998
5117
 
4999
5118
  const int64_t n_embd;
5000
5119
  const int64_t n_layer;
5120
+ const int64_t n_rot;
5001
5121
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
5002
5122
  const int64_t n_head;
5003
5123
  const int64_t n_head_kv;
@@ -5022,8 +5142,8 @@ struct llm_build_context {
5022
5142
  const int32_t kv_head; // index of where we store new KV data in the cache
5023
5143
  const int32_t n_orig_ctx;
5024
5144
 
5025
- const bool do_rope_shift;
5026
- const uint32_t pooling_type;
5145
+ const enum llama_pooling_type pooling_type;
5146
+ const enum llama_rope_type rope_type;
5027
5147
 
5028
5148
  const llm_build_cb & cb;
5029
5149
 
@@ -5045,6 +5165,7 @@ struct llm_build_context {
5045
5165
  kv_self (lctx.kv_self),
5046
5166
  n_embd (hparams.n_embd),
5047
5167
  n_layer (hparams.n_layer),
5168
+ n_rot (hparams.n_rot),
5048
5169
  n_ctx (cparams.n_ctx),
5049
5170
  n_head (hparams.n_head),
5050
5171
  n_head_kv (hparams.n_head_kv),
@@ -5066,8 +5187,8 @@ struct llm_build_context {
5066
5187
  n_kv (worst_case ? n_ctx : kv_self.n),
5067
5188
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
5068
5189
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5069
- do_rope_shift (worst_case || kv_self.has_shift),
5070
- pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
5190
+ pooling_type (cparams.pooling_type),
5191
+ rope_type (hparams.rope_type),
5071
5192
  cb (cb),
5072
5193
  buf_compute_meta (lctx.buf_compute_meta) {
5073
5194
  // all initializations should be done in init()
@@ -5090,6 +5211,76 @@ struct llm_build_context {
5090
5211
  }
5091
5212
  }
5092
5213
 
5214
+ struct ggml_cgraph * build_k_shift() {
5215
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5216
+
5217
+ for (int il = 0; il < n_layer; ++il) {
5218
+ struct ggml_tensor * tmp =
5219
+ // we rotate only the first n_rot dimensions
5220
+ ggml_rope_custom_inplace(ctx0,
5221
+ ggml_view_3d(ctx0, kv_self.k_l[il],
5222
+ n_embd_head_k, n_head_kv, n_ctx,
5223
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
5224
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5225
+ 0),
5226
+ lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5227
+ ext_factor, attn_factor, beta_fast, beta_slow);
5228
+ cb(tmp, "K_shifted", il);
5229
+ ggml_build_forward_expand(gf, tmp);
5230
+ }
5231
+
5232
+ return gf;
5233
+ }
5234
+
5235
+ struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
5236
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5237
+
5238
+ for (uint32_t i = 0; i < ids.size(); ++i) {
5239
+ const uint32_t id = ids[i];
5240
+
5241
+ if (i == id || id == ids.size()) {
5242
+ continue;
5243
+ }
5244
+
5245
+ uint32_t nm = 1;
5246
+
5247
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
5248
+ nm++;
5249
+ }
5250
+
5251
+ for (int il = 0; il < n_layer; ++il) {
5252
+ ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
5253
+ n_embd_k_gqa, nm,
5254
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5255
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
5256
+
5257
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
5258
+ n_embd_k_gqa, nm,
5259
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5260
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
5261
+
5262
+ ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
5263
+ nm, n_embd_v_gqa,
5264
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5265
+ ggml_row_size(kv_self.v_l[il]->type, i));
5266
+
5267
+ ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
5268
+ nm, n_embd_v_gqa,
5269
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5270
+ ggml_row_size(kv_self.v_l[il]->type, id));
5271
+
5272
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
5273
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
5274
+ }
5275
+
5276
+ i += nm - 1;
5277
+ }
5278
+
5279
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
5280
+
5281
+ return gf;
5282
+ }
5283
+
5093
5284
  struct ggml_cgraph * build_llama() {
5094
5285
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5095
5286
 
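build_defrag above receives a vector ids where ids[i] is the destination slot of cache cell i (or ids.size() for cells that are not moved) and coalesces consecutive cells shifting by the same offset into a single K/V copy per layer; build_k_shift is the same K-cache rotation that was previously done by the now-removed llm_build_k_shift helper. A host-side sketch of just the run-merging logic, with the ggml copies replaced by a printed plan:

    #include <cstdio>
    #include <vector>

    struct copy_range { unsigned src, dst, len; };

    static std::vector<copy_range> plan_moves(const std::vector<unsigned> & ids) {
        std::vector<copy_range> moves;
        for (unsigned i = 0; i < ids.size(); ++i) {
            const unsigned id = ids[i];
            if (i == id || id == ids.size()) {
                continue; // already in place, or not moving
            }
            unsigned nm = 1;
            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
                nm++; // extend the contiguous run
            }
            moves.push_back({i, id, nm});
            i += nm - 1;
        }
        return moves;
    }

    int main() {
        // cells 4,5,6 move to slots 1,2,3; everything else stays put
        const std::vector<unsigned> ids = {0, 7, 7, 7, 1, 2, 3};
        for (const copy_range & m : plan_moves(ids)) {
            std::printf("copy %u cells from %u to %u\n", m.len, m.src, m.dst);
        }
        return 0;
    }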
@@ -5111,11 +5302,6 @@ struct llm_build_context {
5111
5302
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5112
5303
  cb(KQ_mask, "KQ_mask", -1);
5113
5304
 
5114
- // shift the entire K-cache if needed
5115
- if (do_rope_shift) {
5116
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5117
- }
5118
-
5119
5305
  for (int il = 0; il < n_layer; ++il) {
5120
5306
  struct ggml_tensor * inpSA = inpL;
5121
5307
 
@@ -5151,14 +5337,14 @@ struct llm_build_context {
5151
5337
 
5152
5338
  Qcur = ggml_rope_custom(
5153
5339
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5154
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5340
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5155
5341
  ext_factor, attn_factor, beta_fast, beta_slow
5156
5342
  );
5157
5343
  cb(Qcur, "Qcur", il);
5158
5344
 
5159
5345
  Kcur = ggml_rope_custom(
5160
5346
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5161
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5347
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5162
5348
  ext_factor, attn_factor, beta_fast, beta_slow
5163
5349
  );
5164
5350
  cb(Kcur, "Kcur", il);
@@ -5299,11 +5485,6 @@ struct llm_build_context {
5299
5485
  struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5300
5486
  cb(KQ_pos, "KQ_pos", -1);
5301
5487
 
5302
- // shift the entire K-cache if needed
5303
- if (do_rope_shift) {
5304
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5305
- }
5306
-
5307
5488
  for (int il = 0; il < n_layer; ++il) {
5308
5489
  struct ggml_tensor * inpSA = inpL;
5309
5490
 
@@ -5327,12 +5508,12 @@ struct llm_build_context {
5327
5508
  case MODEL_7B:
5328
5509
  Qcur = ggml_rope_custom(
5329
5510
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5330
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5511
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5331
5512
  ext_factor, attn_factor, beta_fast, beta_slow
5332
5513
  );
5333
5514
  Kcur = ggml_rope_custom(
5334
5515
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5335
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5516
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5336
5517
  ext_factor, attn_factor, beta_fast, beta_slow
5337
5518
  );
5338
5519
  break;
@@ -5417,11 +5598,6 @@ struct llm_build_context {
5417
5598
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5418
5599
  cb(KQ_mask, "KQ_mask", -1);
5419
5600
 
5420
- // shift the entire K-cache if needed
5421
- if (do_rope_shift) {
5422
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5423
- }
5424
-
5425
5601
  for (int il = 0; il < n_layer; ++il) {
5426
5602
  struct ggml_tensor * attn_norm;
5427
5603
 
@@ -5460,13 +5636,13 @@ struct llm_build_context {
5460
5636
 
5461
5637
  // using mode = 2 for neox mode
5462
5638
  Qcur = ggml_rope_custom(
5463
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5639
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5464
5640
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5465
5641
  );
5466
5642
  cb(Qcur, "Qcur", il);
5467
5643
 
5468
5644
  Kcur = ggml_rope_custom(
5469
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5645
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5470
5646
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5471
5647
  );
5472
5648
  cb(Kcur, "Kcur", il);
@@ -5636,10 +5812,6 @@ struct llm_build_context {
5636
5812
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5637
5813
  cb(KQ_mask, "KQ_mask", -1);
5638
5814
 
5639
- if (do_rope_shift) {
5640
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5641
- }
5642
-
5643
5815
  for (int il = 0; il < n_layer; ++il) {
5644
5816
  struct ggml_tensor * residual = inpL;
5645
5817
 
@@ -5697,7 +5869,7 @@ struct llm_build_context {
5697
5869
 
5698
5870
  // RoPE the first n_rot of q/k, pass the other half, and concat.
5699
5871
  struct ggml_tensor * qrot = ggml_view_3d(
5700
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5872
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5701
5873
  ggml_element_size(tmpq) * n_embd_head,
5702
5874
  ggml_element_size(tmpq) * n_embd_head * n_head,
5703
5875
  0
@@ -5705,7 +5877,7 @@ struct llm_build_context {
5705
5877
  cb(qrot, "qrot", il);
5706
5878
 
5707
5879
  struct ggml_tensor * krot = ggml_view_3d(
5708
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5880
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5709
5881
  ggml_element_size(tmpk) * n_embd_head,
5710
5882
  ggml_element_size(tmpk) * n_embd_head * n_head,
5711
5883
  0
@@ -5714,29 +5886,29 @@ struct llm_build_context {
5714
5886
 
5715
5887
  // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
5716
5888
  struct ggml_tensor * qpass = ggml_view_3d(
5717
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5889
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5718
5890
  ggml_element_size(tmpq) * n_embd_head,
5719
5891
  ggml_element_size(tmpq) * n_embd_head * n_head,
5720
- ggml_element_size(tmpq) * hparams.n_rot
5892
+ ggml_element_size(tmpq) * n_rot
5721
5893
  );
5722
5894
  cb(qpass, "qpass", il);
5723
5895
 
5724
5896
  struct ggml_tensor * kpass = ggml_view_3d(
5725
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5897
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5726
5898
  ggml_element_size(tmpk) * n_embd_head,
5727
5899
  ggml_element_size(tmpk) * n_embd_head * n_head,
5728
- ggml_element_size(tmpk) * hparams.n_rot
5900
+ ggml_element_size(tmpk) * n_rot
5729
5901
  );
5730
5902
  cb(kpass, "kpass", il);
5731
5903
 
5732
5904
  struct ggml_tensor * qrotated = ggml_rope_custom(
5733
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5905
+ ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5734
5906
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5735
5907
  );
5736
5908
  cb(qrotated, "qrotated", il);
5737
5909
 
5738
5910
  struct ggml_tensor * krotated = ggml_rope_custom(
5739
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5911
+ ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5740
5912
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5741
5913
  );
5742
5914
  cb(krotated, "krotated", il);
@@ -5921,6 +6093,7 @@ struct llm_build_context {
5921
6093
 
5922
6094
  const int64_t n_embd_head = hparams.n_embd_head_v;
5923
6095
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6096
+
5924
6097
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5925
6098
 
5926
6099
  struct ggml_tensor * cur;
@@ -5928,9 +6101,10 @@ struct llm_build_context {
5928
6101
 
5929
6102
  // get input vectors with right size
5930
6103
  const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5931
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6104
+
6105
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5932
6106
  struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5933
- struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
6107
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5934
6108
 
5935
6109
  // construct input embeddings (token, type, position)
5936
6110
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5948,39 +6122,38 @@ struct llm_build_context {
5948
6122
  cb(inpL, "inp_norm", -1);
5949
6123
 
5950
6124
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5951
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5952
- cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
6125
+ struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
6126
+ cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
5953
6127
 
5954
6128
  // iterate layers
5955
6129
  for (int il = 0; il < n_layer; ++il) {
5956
6130
  struct ggml_tensor * cur = inpL;
5957
6131
 
6132
+ struct ggml_tensor * Qcur;
6133
+ struct ggml_tensor * Kcur;
6134
+ struct ggml_tensor * Vcur;
6135
+
5958
6136
  // self-attention
5959
6137
  if (model.arch == LLM_ARCH_BERT) {
5960
- struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6138
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5961
6139
  cb(Qcur, "Qcur", il);
5962
6140
 
5963
- struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6141
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
5964
6142
  cb(Kcur, "Kcur", il);
5965
6143
 
5966
- struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6144
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
5967
6145
  cb(Vcur, "Vcur", il);
5968
6146
 
5969
- // seems like we just need to do this for Q?
5970
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5971
-
5972
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5973
- model.layers[il].wo, model.layers[il].bo,
5974
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5975
- cb(cur, "kqv_out", il);
6147
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6148
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5976
6149
  } else {
5977
6150
  // compute Q and K and RoPE them
5978
6151
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5979
6152
  cb(cur, "wqkv", il);
5980
6153
 
5981
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5982
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5983
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6154
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6155
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6156
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5984
6157
 
5985
6158
  cb(Qcur, "Qcur", il);
5986
6159
  cb(Kcur, "Kcur", il);
@@ -5988,24 +6161,52 @@ struct llm_build_context {
5988
6161
 
5989
6162
  Qcur = ggml_rope_custom(
5990
6163
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5991
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6164
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5992
6165
  ext_factor, attn_factor, beta_fast, beta_slow
5993
6166
  );
5994
6167
  cb(Qcur, "Qcur", il);
5995
6168
 
5996
6169
  Kcur = ggml_rope_custom(
5997
6170
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5998
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6171
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5999
6172
  ext_factor, attn_factor, beta_fast, beta_slow
6000
6173
  );
6001
6174
  cb(Kcur, "Kcur", il);
6175
+ }
6002
6176
 
6003
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6004
- model.layers[il].wo, model.layers[il].bo,
6005
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6006
- cb(cur, "kqv_out", il);
6177
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
6178
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
6179
+
6180
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
6181
+ cb(kq, "kq", il);
6182
+
6183
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
6184
+ cb(kq, "kq_soft_max_ext", il);
6185
+
6186
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
6187
+ cb(v, "v", il);
6188
+
6189
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
6190
+ cb(kqv, "kqv", il);
6191
+
6192
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
6193
+ cb(kqv_merged, "kqv_merged", il);
6194
+
6195
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
6196
+ cb(cur, "kqv_merged_cont", il);
6197
+
6198
+ ggml_build_forward_expand(gf, cur);
6199
+
6200
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
6201
+ if (model.layers[il].bo) {
6202
+ cb(cur, "kqv_wo", il);
6007
6203
  }
6008
6204
 
6205
+ if (model.layers[il].bo) {
6206
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
6207
+ }
6208
+ cb(cur, "kqv_out", il);
6209
+
6009
6210
  // re-add the layer input
6010
6211
  cur = ggml_add(ctx0, cur, inpL);
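The block above spells the attention out as explicit ggml ops instead of calling llm_build_kv: K^T.Q, a scaled and masked softmax via ggml_soft_max_ext, a multiplication with the transposed V, and a merge of the heads back into a single matrix. A minimal dependency-free sketch of the same per-head math, with a toy row-major layout and illustrative names (this is not the ggml implementation):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Single-head scaled dot-product attention over row-major n x d matrices.
    // mask[i*n + j] is 0.0f (token i may attend to j) or -INFINITY (blocked).
    static std::vector<float> toy_attention(
            const std::vector<float> & q,
            const std::vector<float> & k,
            const std::vector<float> & v,
            const std::vector<float> & mask,
            int n, int d) {
        const float scale = 1.0f/std::sqrt((float) d);

        std::vector<float> out((size_t) n*d, 0.0f);
        std::vector<float> w(n);

        for (int i = 0; i < n; ++i) {
            // scores: scale * (q_i . k_j) + mask - the same inputs ggml_soft_max_ext receives here
            float wmax = -INFINITY;
            for (int j = 0; j < n; ++j) {
                float s = 0.0f;
                for (int c = 0; c < d; ++c) {
                    s += q[i*d + c]*k[j*d + c];
                }
                w[j] = scale*s + mask[i*n + j];
                wmax = std::max(wmax, w[j]);
            }

            // softmax over the key dimension
            float sum = 0.0f;
            for (int j = 0; j < n; ++j) { w[j] = std::exp(w[j] - wmax); sum += w[j]; }

            // weighted sum of the value rows gives output row i
            for (int j = 0; j < n; ++j) {
                for (int c = 0; c < d; ++c) {
                    out[i*d + c] += (w[j]/sum)*v[j*d + c];
                }
            }
        }
        return out;
    }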
6011
6212
 
@@ -6045,16 +6246,29 @@ struct llm_build_context {
6045
6246
 
6046
6247
  // final output
6047
6248
  cur = inpL;
6249
+ cb(cur, "result_embd", -1);
6048
6250
 
6049
6251
  // pooling layer
6050
- if (pooling_type == LLAMA_POOLING_MEAN) {
6051
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6052
- } else if (pooling_type == LLAMA_POOLING_CLS) {
6053
- cur = ggml_get_rows(ctx0, cur, inp_cls);
6054
- } else {
6055
- GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
6252
+ switch (pooling_type) {
6253
+ case LLAMA_POOLING_TYPE_NONE:
6254
+ {
6255
+ // nop
6256
+ } break;
6257
+ case LLAMA_POOLING_TYPE_MEAN:
6258
+ {
6259
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6260
+ cb(cur, "result_embd_pooled", -1);
6261
+ } break;
6262
+ case LLAMA_POOLING_TYPE_CLS:
6263
+ {
6264
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6265
+ cb(cur, "result_embd_pooled", -1);
6266
+ } break;
6267
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
6268
+ {
6269
+ GGML_ASSERT(false && "Invalid pooling type");
6270
+ } break;
6056
6271
  }
6057
- cb(cur, "result_embd", -1);
6058
6272
 
6059
6273
  ggml_build_forward_expand(gf, cur);
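Here the pooling choice only selects a final reduction over the token embeddings: NONE passes them through, MEAN multiplies them with the precomputed inp_mean matrix (one row per sequence holding 1/count for that sequence's tokens, filled in llama_set_inputs), and CLS gathers each sequence's position-0 row via inp_cls. A toy sketch of the two reductions for a single sequence, with plain arrays instead of ggml tensors and illustrative names:

    #include <vector>

    // embd: n_tokens x n_embd token embeddings of one sequence, row-major.
    static std::vector<float> toy_pool_mean(const std::vector<float> & embd, int n_tokens, int n_embd) {
        std::vector<float> out(n_embd, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            for (int c = 0; c < n_embd; ++c) {
                out[c] += embd[i*n_embd + c]/n_tokens; // same effect as ggml_mul_mat with inp_mean
            }
        }
        return out;
    }

    static std::vector<float> toy_pool_cls(const std::vector<float> & embd, int n_embd, int cls_row) {
        // same effect as ggml_get_rows with inp_cls holding the index of the sequence's first token
        return std::vector<float>(embd.begin() + cls_row*n_embd,
                                  embd.begin() + (cls_row + 1)*n_embd);
    }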
6060
6274
 
@@ -6284,11 +6498,6 @@ struct llm_build_context {
6284
6498
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6285
6499
  cb(KQ_mask, "KQ_mask", -1);
6286
6500
 
6287
- // shift the entire K-cache if needed
6288
- if (do_rope_shift) {
6289
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6290
- }
6291
-
6292
6501
  for (int il = 0; il < n_layer; ++il) {
6293
6502
  struct ggml_tensor * inpSA = inpL;
6294
6503
 
@@ -6325,14 +6534,14 @@ struct llm_build_context {
6325
6534
 
6326
6535
  Qcur = ggml_rope_custom(
6327
6536
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6328
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6537
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6329
6538
  ext_factor, attn_factor, beta_fast, beta_slow
6330
6539
  );
6331
6540
  cb(Qcur, "Qcur", il);
6332
6541
 
6333
6542
  Kcur = ggml_rope_custom(
6334
6543
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6335
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6544
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6336
6545
  ext_factor, attn_factor, beta_fast, beta_slow
6337
6546
  );
6338
6547
  cb(Kcur, "Kcur", il);
@@ -6407,11 +6616,6 @@ struct llm_build_context {
6407
6616
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6408
6617
  cb(KQ_mask, "KQ_mask", -1);
6409
6618
 
6410
- // shift the entire K-cache if needed
6411
- if (do_rope_shift) {
6412
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6413
- }
6414
-
6415
6619
  for (int il = 0; il < n_layer; ++il) {
6416
6620
  struct ggml_tensor * inpSA = inpL;
6417
6621
 
@@ -6441,13 +6645,13 @@ struct llm_build_context {
6441
6645
 
6442
6646
  // using mode = 2 for neox mode
6443
6647
  Qcur = ggml_rope_custom(
6444
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6648
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6445
6649
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6446
6650
  );
6447
6651
  cb(Qcur, "Qcur", il);
6448
6652
 
6449
6653
  Kcur = ggml_rope_custom(
6450
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6654
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6451
6655
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6452
6656
  );
6453
6657
  cb(Kcur, "Kcur", il);
@@ -6521,11 +6725,6 @@ struct llm_build_context {
6521
6725
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6522
6726
  cb(KQ_mask, "KQ_mask", -1);
6523
6727
 
6524
- // shift the entire K-cache if needed
6525
- if (do_rope_shift) {
6526
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6527
- }
6528
-
6529
6728
  for (int il = 0; il < n_layer; ++il) {
6530
6729
  struct ggml_tensor * inpSA = inpL;
6531
6730
 
@@ -6561,14 +6760,14 @@ struct llm_build_context {
6561
6760
 
6562
6761
  Qcur = ggml_rope_custom(
6563
6762
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6564
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6763
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6565
6764
  ext_factor, attn_factor, beta_fast, beta_slow
6566
6765
  );
6567
6766
  cb(Qcur, "Qcur", il);
6568
6767
 
6569
6768
  Kcur = ggml_rope_custom(
6570
6769
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6571
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6770
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6572
6771
  ext_factor, attn_factor, beta_fast, beta_slow
6573
6772
  );
6574
6773
  cb(Kcur, "Kcur", il);
@@ -6642,11 +6841,6 @@ struct llm_build_context {
6642
6841
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6643
6842
  cb(KQ_mask, "KQ_mask", -1);
6644
6843
 
6645
- // shift the entire K-cache if needed
6646
- if (do_rope_shift) {
6647
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6648
- }
6649
-
6650
6844
  for (int il = 0; il < n_layer; ++il) {
6651
6845
  attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
6652
6846
  model.layers[il].attn_norm,
@@ -6684,7 +6878,7 @@ struct llm_build_context {
6684
6878
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6685
6879
 
6686
6880
  Qcur = ggml_rope_custom(
6687
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6881
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6688
6882
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6689
6883
  );
6690
6884
  cb(Qcur, "Qcur", il);
@@ -6695,7 +6889,7 @@ struct llm_build_context {
6695
6889
  cb(Qcur, "Qcur", il);
6696
6890
 
6697
6891
  Kcur = ggml_rope_custom(
6698
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6892
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6699
6893
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6700
6894
  );
6701
6895
  cb(Kcur, "Kcur", il);
@@ -6764,11 +6958,6 @@ struct llm_build_context {
6764
6958
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6765
6959
  cb(KQ_mask, "KQ_mask", -1);
6766
6960
 
6767
- // shift the entire K-cache if needed
6768
- if (do_rope_shift) {
6769
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6770
- }
6771
-
6772
6961
  for (int il = 0; il < n_layer; ++il) {
6773
6962
 
6774
6963
  // norm
@@ -6792,14 +6981,14 @@ struct llm_build_context {
6792
6981
  cb(Vcur, "Vcur", il);
6793
6982
 
6794
6983
  Qcur = ggml_rope_custom(
6795
- ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
6796
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6984
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
6985
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6797
6986
  ext_factor, attn_factor, beta_fast, beta_slow);
6798
6987
  cb(Qcur, "Qcur", il);
6799
6988
 
6800
6989
  Kcur = ggml_rope_custom(
6801
- ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
6802
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6990
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
6991
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6803
6992
  ext_factor, attn_factor, beta_fast, beta_slow);
6804
6993
  cb(Kcur, "Kcur", il);
6805
6994
 
@@ -6969,11 +7158,6 @@ struct llm_build_context {
6969
7158
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6970
7159
  cb(KQ_mask, "KQ_mask", -1);
6971
7160
 
6972
- // shift the entire K-cache if needed
6973
- if (do_rope_shift) {
6974
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6975
- }
6976
-
6977
7161
  for (int il = 0; il < n_layer; ++il) {
6978
7162
  cur = llm_build_norm(ctx0, inpL, hparams,
6979
7163
  model.layers[il].attn_norm,
@@ -6999,14 +7183,14 @@ struct llm_build_context {
6999
7183
 
7000
7184
  struct ggml_tensor * Qcur = ggml_rope_custom(
7001
7185
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
7002
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7186
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7003
7187
  ext_factor, attn_factor, beta_fast, beta_slow
7004
7188
  );
7005
7189
  cb(Qcur, "Qcur", il);
7006
7190
 
7007
7191
  struct ggml_tensor * Kcur = ggml_rope_custom(
7008
7192
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
7009
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7193
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7010
7194
  ext_factor, attn_factor, beta_fast, beta_slow
7011
7195
  );
7012
7196
  cb(Kcur, "Kcur", il);
@@ -7077,11 +7261,6 @@ struct llm_build_context {
7077
7261
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7078
7262
  cb(KQ_mask, "KQ_mask", -1);
7079
7263
 
7080
- // shift the entire K-cache if needed
7081
- if (do_rope_shift) {
7082
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7083
- }
7084
-
7085
7264
  for (int il = 0; il < n_layer; ++il) {
7086
7265
  struct ggml_tensor * inpSA = inpL;
7087
7266
 
@@ -7117,14 +7296,14 @@ struct llm_build_context {
7117
7296
 
7118
7297
  Qcur = ggml_rope_custom(
7119
7298
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7120
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7299
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7121
7300
  ext_factor, attn_factor, beta_fast, beta_slow
7122
7301
  );
7123
7302
  cb(Qcur, "Qcur", il);
7124
7303
 
7125
7304
  Kcur = ggml_rope_custom(
7126
7305
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7127
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7306
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7128
7307
  ext_factor, attn_factor, beta_fast, beta_slow
7129
7308
  );
7130
7309
  cb(Kcur, "Kcur", il);
@@ -7196,11 +7375,6 @@ struct llm_build_context {
7196
7375
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7197
7376
  cb(KQ_mask, "KQ_mask", -1);
7198
7377
 
7199
- // shift the entire K-cache if needed
7200
- if (do_rope_shift) {
7201
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7202
- }
7203
-
7204
7378
  for (int il = 0; il < n_layer; ++il) {
7205
7379
  struct ggml_tensor * inpSA = inpL;
7206
7380
 
@@ -7236,14 +7410,14 @@ struct llm_build_context {
7236
7410
 
7237
7411
  Qcur = ggml_rope_custom(
7238
7412
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7239
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7413
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7240
7414
  ext_factor, attn_factor, beta_fast, beta_slow
7241
7415
  );
7242
7416
  cb(Qcur, "Qcur", il);
7243
7417
 
7244
7418
  Kcur = ggml_rope_custom(
7245
7419
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7246
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7420
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7421
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7422
  );
7249
7423
  cb(Kcur, "Kcur", il);
@@ -7328,11 +7502,6 @@ struct llm_build_context {
7328
7502
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7329
7503
  cb(KQ_mask, "KQ_mask", -1);
7330
7504
 
7331
- // shift the entire K-cache if needed
7332
- if (do_rope_shift) {
7333
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7334
- }
7335
-
7336
7505
  for (int il = 0; il < n_layer; ++il) {
7337
7506
  struct ggml_tensor * inpSA = inpL;
7338
7507
 
@@ -7368,14 +7537,14 @@ struct llm_build_context {
7368
7537
 
7369
7538
  Qcur = ggml_rope_custom(
7370
7539
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7371
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7540
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7372
7541
  ext_factor, attn_factor, beta_fast, beta_slow
7373
7542
  );
7374
7543
  cb(Qcur, "Qcur", il);
7375
7544
 
7376
7545
  Kcur = ggml_rope_custom(
7377
7546
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7378
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7547
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7379
7548
  ext_factor, attn_factor, beta_fast, beta_slow
7380
7549
  );
7381
7550
  cb(Kcur, "Kcur", il);
@@ -7464,11 +7633,6 @@ struct llm_build_context {
7464
7633
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7465
7634
  cb(KQ_mask, "KQ_mask", -1);
7466
7635
 
7467
- // shift the entire K-cache if needed
7468
- if (do_rope_shift) {
7469
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7470
- }
7471
-
7472
7636
  for (int il = 0; il < n_layer; ++il) {
7473
7637
 
7474
7638
  // norm
@@ -7491,7 +7655,7 @@ struct llm_build_context {
7491
7655
 
7492
7656
  Qcur = ggml_rope_custom(
7493
7657
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
7494
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7658
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7495
7659
  ext_factor, attn_factor, beta_fast, beta_slow);
7496
7660
  cb(Qcur, "Qcur", il);
7497
7661
 
@@ -7500,7 +7664,7 @@ struct llm_build_context {
7500
7664
 
7501
7665
  Kcur = ggml_rope_custom(
7502
7666
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
7503
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7667
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7504
7668
  ext_factor, attn_factor, beta_fast, beta_slow);
7505
7669
  cb(Kcur, "Kcur", il);
7506
7670
 
@@ -7551,33 +7715,181 @@ struct llm_build_context {
7551
7715
 
7552
7716
  return gf;
7553
7717
  }
7554
- };
7555
-
7556
- static struct ggml_cgraph * llama_build_graph(
7557
- llama_context & lctx,
7558
- const llama_batch & batch,
7559
- bool worst_case) {
7560
- const auto & model = lctx.model;
7561
7718
 
7562
- // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7563
- llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7564
- if (il >= 0) {
7565
- ggml_format_name(cur, "%s-%d", name, il);
7566
- } else {
7567
- ggml_set_name(cur, name);
7568
- }
7719
+ struct ggml_cgraph * build_starcoder2() {
7720
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7569
7721
 
7570
- if (!lctx.cparams.offload_kqv) {
7571
- if (strcmp(name, "kqv_merged_cont") == 0) {
7572
- // all nodes between the KV store and the attention output are run on the CPU
7573
- ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
7574
- }
7575
- }
7576
- };
7722
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7723
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7724
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7577
7725
 
7578
- struct ggml_cgraph * result = NULL;
7726
+ struct ggml_tensor * cur;
7727
+ struct ggml_tensor * inpL;
7579
7728
 
7580
- struct llm_build_context llm(lctx, batch, cb, worst_case);
7729
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7730
+ cb(inpL, "inp_embd", -1);
7731
+
7732
+ // inp_pos - contains the positions
7733
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7734
+ cb(inp_pos, "inp_pos", -1);
7735
+
7736
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7737
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7738
+ cb(KQ_mask, "KQ_mask", -1);
7739
+
7740
+ for (int il = 0; il < n_layer; ++il) {
7741
+ struct ggml_tensor * inpSA = inpL;
7742
+
7743
+ // norm
7744
+ cur = llm_build_norm(ctx0, inpL, hparams,
7745
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
7746
+ LLM_NORM, cb, il);
7747
+ cb(cur, "attn_norm", il);
7748
+
7749
+ // self-attention
7750
+ {
7751
+ // compute Q and K and RoPE them
7752
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
7753
+ cb(Qcur, "Qcur", il);
7754
+ if (model.layers[il].bq) {
7755
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
7756
+ cb(Qcur, "Qcur", il);
7757
+ }
7758
+
7759
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
7760
+ cb(Kcur, "Kcur", il);
7761
+ if (model.layers[il].bk) {
7762
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
7763
+ cb(Kcur, "Kcur", il);
7764
+ }
7765
+
7766
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7767
+ cb(Vcur, "Vcur", il);
7768
+ if (model.layers[il].bv) {
7769
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
7770
+ cb(Vcur, "Vcur", il);
7771
+ }
7772
+
7773
+ Qcur = ggml_rope_custom(
7774
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7775
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7776
+ ext_factor, attn_factor, beta_fast, beta_slow
7777
+ );
7778
+ cb(Qcur, "Qcur", il);
7779
+
7780
+ Kcur = ggml_rope_custom(
7781
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7782
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7783
+ ext_factor, attn_factor, beta_fast, beta_slow
7784
+ );
7785
+ cb(Kcur, "Kcur", il);
7786
+
7787
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7788
+ model.layers[il].wo, model.layers[il].bo,
7789
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7790
+ cb(cur, "kqv_out", il);
7791
+ }
7792
+
7793
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7794
+ cb(ffn_inp, "ffn_inp", il);
7795
+
7796
+ // feed-forward network
7797
+
7798
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7799
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
7800
+ LLM_NORM, cb, il);
7801
+ cb(cur, "ffn_norm", il);
7802
+
7803
+ cur = llm_build_ffn(ctx0, cur,
7804
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7805
+ NULL, NULL,
7806
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7807
+ NULL,
7808
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
7809
+ cb(cur, "ffn_out", il);
7810
+ cur = ggml_add(ctx0, cur, ffn_inp);
7811
+ cb(cur, "l_out", il);
7812
+
7813
+ // input for next layer
7814
+ inpL = cur;
7815
+ }
7816
+
7817
+ cur = inpL;
7818
+
7819
+ cur = llm_build_norm(ctx0, cur, hparams,
7820
+ model.output_norm, model.output_norm_b,
7821
+ LLM_NORM, cb, -1);
7822
+ cb(cur, "result_norm", -1);
7823
+
7824
+ // lm_head
7825
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7826
+ cb(cur, "result_output", -1);
7827
+
7828
+ ggml_build_forward_expand(gf, cur);
7829
+
7830
+ return gf;
7831
+ }
7832
+ };
7833
+
7834
+ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
7835
+ llama_batch dummy;
7836
+ dummy.n_tokens = 0;
7837
+
7838
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7839
+
7840
+ struct llm_build_context llm(lctx, dummy, cb, false);
7841
+
7842
+ llm.init();
7843
+
7844
+ struct ggml_cgraph * result = llm.build_defrag(ids);
7845
+
7846
+ llm.free();
7847
+
7848
+ return result;
7849
+ }
7850
+
7851
+ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
7852
+ llama_batch dummy;
7853
+ dummy.n_tokens = 0;
7854
+
7855
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7856
+
7857
+ struct llm_build_context llm(lctx, dummy, cb, false);
7858
+
7859
+ llm.init();
7860
+
7861
+ struct ggml_cgraph * result = llm.build_k_shift();
7862
+
7863
+ llm.free();
7864
+
7865
+ return result;
7866
+ }
7867
+
7868
+ static struct ggml_cgraph * llama_build_graph(
7869
+ llama_context & lctx,
7870
+ const llama_batch & batch,
7871
+ bool worst_case) {
7872
+ const auto & model = lctx.model;
7873
+
7874
+ // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7875
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7876
+ if (il >= 0) {
7877
+ ggml_format_name(cur, "%s-%d", name, il);
7878
+ } else {
7879
+ ggml_set_name(cur, name);
7880
+ }
7881
+
7882
+ if (!lctx.cparams.offload_kqv) {
7883
+ if (strcmp(name, "kqv_merged_cont") == 0) {
7884
+ // all nodes between the KV store and the attention output are run on the CPU
7885
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
7886
+ }
7887
+ }
7888
+ };
7889
+
7890
+ struct ggml_cgraph * result = NULL;
7891
+
7892
+ struct llm_build_context llm(lctx, batch, cb, worst_case);
7581
7893
 
7582
7894
  llm.init();
7583
7895
 
@@ -7663,6 +7975,10 @@ static struct ggml_cgraph * llama_build_graph(
7663
7975
  {
7664
7976
  result = llm.build_gemma();
7665
7977
  } break;
7978
+ case LLM_ARCH_STARCODER2:
7979
+ {
7980
+ result = llm.build_starcoder2();
7981
+ } break;
7666
7982
  default:
7667
7983
  GGML_ASSERT(false);
7668
7984
  }
@@ -7672,6 +7988,20 @@ static struct ggml_cgraph * llama_build_graph(
7672
7988
  return result;
7673
7989
  }
7674
7990
 
7991
+ static void llama_set_k_shift(llama_context & lctx) {
7992
+ const auto & cparams = lctx.cparams;
7993
+
7994
+ const int64_t n_ctx = cparams.n_ctx;
7995
+
7996
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7997
+
7998
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7999
+
8000
+ for (int i = 0; i < n_ctx; ++i) {
8001
+ data[i] = lctx.kv_self.cells[i].delta;
8002
+ }
8003
+ }
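llama_set_k_shift only copies each cell's accumulated position delta into the inp_K_shift input tensor; the K-shift graph built further below then re-rotates the cached keys by that delta, after which the deltas are reset. A minimal sketch of the metadata side, with a hypothetical toy_kv_cell standing in for llama_kv_cell (names and fields are illustrative assumptions):

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for a KV cell: a position plus the pending shift that
    // still has to be applied to the cached K rows of that cell.
    struct toy_kv_cell {
        int32_t pos   = -1; // -1 means the cell is empty
        int32_t delta = 0;
    };

    // Shift every occupied cell by d positions: the metadata changes immediately,
    // the tensor data only once the K-shift graph runs with inp_K_shift = deltas.
    static void toy_shift_cache(std::vector<toy_kv_cell> & cells, int32_t d, bool & has_shift) {
        for (auto & c : cells) {
            if (c.pos < 0) {
                continue;
            }
            c.pos   += d;
            c.delta += d;
        }
        has_shift = true;
    }

    // Equivalent of llama_set_k_shift: flatten the deltas into the graph input.
    static std::vector<int32_t> toy_collect_k_shift(const std::vector<toy_kv_cell> & cells) {
        std::vector<int32_t> data(cells.size());
        for (size_t i = 0; i < cells.size(); ++i) {
            data[i] = cells[i].delta;
        }
        return data;
    }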
8004
+
7675
8005
  static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7676
8006
  //
7677
8007
  // set input data
@@ -7700,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7700
8030
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7701
8031
  }
7702
8032
 
7703
- {
8033
+ if (hparams.causal_attn) {
7704
8034
  const int64_t n_kv = kv_self.n;
7705
8035
  const int64_t n_tokens = batch.n_tokens;
7706
8036
 
@@ -7715,16 +8045,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7715
8045
 
7716
8046
  for (int i = 0; i < n_kv; ++i) {
7717
8047
  float f;
7718
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7719
- (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
8048
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
7720
8049
  f = -INFINITY;
7721
8050
  } else {
7722
- f = 0;
8051
+ f = 0.0f;
7723
8052
  }
7724
8053
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7725
8054
  }
7726
8055
  }
7727
8056
  }
8057
+ } else {
8058
+ // non-causal attention attends only to the tokens within the batch (i.e. the KV cache is not used)
8059
+ const int64_t n_tokens = batch.n_tokens;
8060
+
8061
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
8062
+
8063
+ float * data = (float *) lctx.inp_KQ_mask->data;
8064
+
8065
+ for (int h = 0; h < 1; ++h) {
8066
+ for (int j = 0; j < n_tokens; ++j) {
8067
+ const llama_seq_id seq_id = batch.seq_id[j][0];
8068
+
8069
+ for (int i = 0; i < n_tokens; ++i) {
8070
+ float f = -INFINITY;
8071
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
8072
+ if (batch.seq_id[i][s] == seq_id) {
8073
+ f = 0.0f;
8074
+ break;
8075
+ }
8076
+ }
8077
+
8078
+ data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
8079
+ }
8080
+ }
8081
+ }
7728
8082
  }
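For the non-causal (embeddings) case the mask is square over the batch only: token j may attend to token i exactly when the two share a sequence id, and every other entry is -INFINITY before the softmax. A self-contained sketch of the same construction, with a plain vector of sequence-id lists standing in for batch.seq_id / batch.n_seq_id:

    #include <cmath>
    #include <vector>

    // seq_ids[i] lists the sequence ids of batch token i.
    static std::vector<float> toy_noncausal_mask(const std::vector<std::vector<int>> & seq_ids) {
        const int n_tokens = (int) seq_ids.size();
        std::vector<float> mask((size_t) n_tokens*n_tokens, -INFINITY);

        for (int j = 0; j < n_tokens; ++j) {           // query token
            const int sj = seq_ids[j][0];
            for (int i = 0; i < n_tokens; ++i) {       // key token
                for (int s : seq_ids[i]) {
                    if (s == sj) {
                        mask[(size_t) j*n_tokens + i] = 0.0f;
                        break;
                    }
                }
            }
        }
        return mask;
    }

    // With tokens {0,1} in sequence 0 and tokens {2,3} in sequence 1 the result is
    // block-diagonal: rows 0-1 attend to columns 0-1, rows 2-3 to columns 2-3.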
7729
8083
 
7730
8084
  if (hparams.need_kq_pos) {
@@ -7739,29 +8093,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7739
8093
  }
7740
8094
  }
7741
8095
 
7742
- if (kv_self.has_shift) {
7743
- const int64_t n_ctx = cparams.n_ctx;
7744
-
7745
- assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7746
-
7747
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7748
-
7749
- for (int i = 0; i < n_ctx; ++i) {
7750
- data[i] = lctx.kv_self.cells[i].delta;
7751
- }
7752
- }
7753
-
7754
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
8096
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7755
8097
  const int64_t n_tokens = batch.n_tokens;
7756
8098
 
7757
8099
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7758
- float * data = (float *) lctx.inp_mean->data;
7759
8100
 
8101
+ float * data = (float *) lctx.inp_mean->data;
7760
8102
  memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7761
8103
 
7762
8104
  std::vector<uint64_t> sum(n_tokens, 0);
7763
8105
  for (int i = 0; i < n_tokens; ++i) {
7764
8106
  const llama_seq_id seq_id = batch.seq_id[i][0];
8107
+
8108
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
8109
+
7765
8110
  sum[seq_id] += 1;
7766
8111
  }
7767
8112
 
@@ -7779,15 +8124,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7779
8124
  }
7780
8125
  }
7781
8126
 
7782
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
8127
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7783
8128
  const int64_t n_tokens = batch.n_tokens;
7784
8129
 
7785
8130
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
8131
+
7786
8132
  uint32_t * data = (uint32_t *) lctx.inp_cls->data;
8133
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
7787
8134
 
7788
8135
  for (int i = 0; i < n_tokens; ++i) {
7789
8136
  const llama_seq_id seq_id = batch.seq_id[i][0];
7790
- const llama_pos pos = batch.pos[i];
8137
+ const llama_pos pos = batch.pos[i];
8138
+
8139
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
8140
+
7791
8141
  if (pos == 0) {
7792
8142
  data[seq_id] = i;
7793
8143
  }
@@ -7795,6 +8145,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7795
8145
  }
7796
8146
  }
7797
8147
 
8148
+ static void llama_graph_compute(
8149
+ llama_context & lctx,
8150
+ ggml_cgraph * gf,
8151
+ int n_threads) {
8152
+ #ifdef GGML_USE_MPI
8153
+ const int64_t n_layer = lctx.model.hparams.n_layer;
8154
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
8155
+ #endif
8156
+
8157
+ #ifdef GGML_USE_METAL
8158
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
8159
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
8160
+ }
8161
+ #endif
8162
+
8163
+ if (lctx.backend_cpu != nullptr) {
8164
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
8165
+ ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
8166
+ }
8167
+
8168
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
8169
+
8170
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
8171
+
8172
+ #ifdef GGML_USE_MPI
8173
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
8174
+ #endif
8175
+ }
8176
+
7798
8177
  // decode a batch of tokens by evaluating the transformer
7799
8178
  //
7800
8179
  // - lctx: llama context
@@ -7821,9 +8200,9 @@ static int llama_decode_internal(
7821
8200
  const auto n_batch = cparams.n_batch;
7822
8201
 
7823
8202
  GGML_ASSERT(n_tokens <= n_batch);
8203
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7824
8204
 
7825
8205
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
7826
- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7827
8206
 
7828
8207
  const int64_t t_start_us = ggml_time_us();
7829
8208
 
@@ -7872,21 +8251,26 @@ static int llama_decode_internal(
7872
8251
  batch.seq_id = seq_id_arr.data();
7873
8252
  }
7874
8253
 
7875
- // if we have enough unused cells before the current head ->
7876
- // better to start searching from the beginning of the cache, hoping to fill it
7877
- if (kv_self.head > kv_self.used + 2*n_tokens) {
7878
- kv_self.head = 0;
7879
- }
8254
+ // non-causal masks do not use the KV cache
8255
+ if (hparams.causal_attn) {
8256
+ llama_kv_cache_update(&lctx);
7880
8257
 
7881
- if (!llama_kv_cache_find_slot(kv_self, batch)) {
7882
- return 1;
7883
- }
8258
+ // if we have enough unused cells before the current head ->
8259
+ // better to start searching from the beginning of the cache, hoping to fill it
8260
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
8261
+ kv_self.head = 0;
8262
+ }
7884
8263
 
7885
- // a heuristic, to avoid attending the full cache if it is not yet utilized
7886
- // after enough generations, the benefit from this heuristic disappears
7887
- // if we start defragmenting the cache, the benefit from this will be more important
7888
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
7889
- //kv_self.n = llama_kv_cache_cell_max(kv_self);
8264
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
8265
+ return 1;
8266
+ }
8267
+
8268
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
8269
+ // after enough generations, the benefit from this heuristic disappears
8270
+ // if we start defragmenting the cache, the benefit from this will be more important
8271
+ kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
8272
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
8273
+ }
7890
8274
 
7891
8275
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
7892
8276
 
@@ -7896,19 +8280,26 @@ static int llama_decode_internal(
7896
8280
  ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7897
8281
 
7898
8282
  // the output is always the last tensor in the graph
7899
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7900
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7901
- if (strcmp(res->name, "result_output") == 0) {
7902
- // the embeddings could be the second to last tensor, or the third to last tensor
7903
- if (strcmp(embeddings->name, "result_norm") != 0) {
7904
- embeddings = gf->nodes[gf->n_nodes - 3];
7905
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7906
- }
7907
- } else if (strcmp(res->name, "result_embd") == 0) {
7908
- embeddings = res;
7909
- res = nullptr;
8283
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
8284
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
8285
+
8286
+ if (!hparams.causal_attn) {
8287
+ res = nullptr; // do not extract logits for embedding models such as BERT
8288
+
8289
+ // token or sequence embeddings
8290
+ embd = gf->nodes[gf->n_nodes - 1];
8291
+
8292
+ GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
7910
8293
  } else {
7911
- GGML_ASSERT(false);
8294
+ if (strcmp(res->name, "result_output") == 0) {
8295
+ // the token embeddings could be the second to last tensor, or the third to last tensor
8296
+ if (strcmp(embd->name, "result_norm") != 0) {
8297
+ embd = gf->nodes[gf->n_nodes - 3];
8298
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
8299
+ }
8300
+ } else {
8301
+ GGML_ASSERT(false && "missing result_output tensor");
8302
+ }
7912
8303
  }
7913
8304
 
7914
8305
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7924,40 +8315,12 @@ static int llama_decode_internal(
7924
8315
  n_threads = std::min(4, n_threads);
7925
8316
  }
7926
8317
 
7927
- #ifdef GGML_USE_MPI
7928
- const int64_t n_layer = hparams.n_layer;
7929
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7930
- #endif
7931
-
7932
- #ifdef GGML_USE_METAL
7933
- if (ggml_backend_is_metal(lctx.backend_metal)) {
7934
- ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7935
- }
7936
- #endif
7937
-
7938
- if (lctx.backend_cpu != nullptr) {
7939
- ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7940
- }
7941
-
7942
8318
  llama_set_inputs(lctx, batch);
7943
8319
 
7944
- ggml_backend_sched_graph_compute(lctx.sched, gf);
7945
-
7946
- // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7947
-
7948
- #ifdef GGML_USE_MPI
7949
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7950
- #endif
8320
+ llama_graph_compute(lctx, gf, n_threads);
7951
8321
 
7952
8322
  // update the kv ring buffer
7953
8323
  {
7954
- if (kv_self.has_shift) {
7955
- kv_self.has_shift = false;
7956
- for (uint32_t i = 0; i < kv_self.size; ++i) {
7957
- kv_self.cells[i].delta = 0;
7958
- }
7959
- }
7960
-
7961
8324
  kv_self.head += n_tokens;
7962
8325
 
7963
8326
  // Ensure kv cache head points to a valid index.
@@ -7966,6 +8329,18 @@ static int llama_decode_internal(
7966
8329
  }
7967
8330
  }
7968
8331
 
8332
+ // decide if we need to defrag the kv cache
8333
+ if (cparams.defrag_thold >= 0.0f) {
8334
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
8335
+
8336
+ // queue defragmentation for next llama_kv_cache_update
8337
+ if (fragmentation > cparams.defrag_thold) {
8338
+ //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
8339
+
8340
+ llama_kv_cache_defrag(kv_self);
8341
+ }
8342
+ }
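The trigger is a simple occupancy ratio over the currently attended window: fragmentation = 1 - (used + n_tokens)/n, evaluated only once the window holds at least 128 cells, and a defrag is queued when it exceeds defrag_thold. For example, a 512-cell window with 300 used cells and a 32-token batch gives 1 - 332/512 ≈ 0.35. A toy version of the check (the threshold and sizes below are made-up values):

    #include <cstdint>

    // Returns true if the attended window is fragmented enough to queue a defrag.
    static bool toy_should_defrag(uint32_t n_window, uint32_t n_used, uint32_t n_tokens, float thold) {
        const float fragmentation = n_window >= 128
            ? 1.0f - float(n_used + n_tokens)/float(n_window)
            : 0.0f;
        return thold >= 0.0f && fragmentation > thold;
    }

    // toy_should_defrag(512, 300, 32, 0.1f) -> true  (fragmentation ~ 0.35)
    // toy_should_defrag(512, 470, 32, 0.1f) -> false (fragmentation ~ 0.02)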
8343
+
7969
8344
  #ifdef GGML_PERF
7970
8345
  // print timing information per ggml operation (for debugging purposes)
7971
8346
  // requires GGML_PERF to be defined
@@ -7991,66 +8366,341 @@ static int llama_decode_internal(
7991
8366
  logits_out.clear();
7992
8367
  #endif
7993
8368
 
7994
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
7995
- GGML_ASSERT(res_backend != nullptr);
8369
+ ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
8370
+ GGML_ASSERT(backend_res != nullptr);
8371
+
7996
8372
  if (batch.logits) {
7997
8373
  logits_out.resize(n_vocab * n_tokens);
7998
8374
  for (uint32_t i = 0; i < n_tokens; i++) {
7999
8375
  if (batch.logits[i] == 0) {
8000
8376
  continue;
8001
8377
  }
8002
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8378
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8003
8379
  #ifndef NDEBUG
8004
8380
  logits_valid[i] = true;
8005
8381
  #endif
8006
8382
  }
8007
8383
  } else if (lctx.logits_all) {
8008
8384
  logits_out.resize(n_vocab * n_tokens);
8009
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8385
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8010
8386
  #ifndef NDEBUG
8011
8387
  std::fill(logits_valid.begin(), logits_valid.end(), true);
8012
8388
  #endif
8013
8389
  } else {
8014
8390
  logits_out.resize(n_vocab);
8015
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8391
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8016
8392
  #ifndef NDEBUG
8017
8393
  logits_valid[0] = true;
8018
8394
  #endif
8019
8395
  }
8020
- ggml_backend_synchronize(res_backend);
8021
- }
8396
+ ggml_backend_synchronize(backend_res);
8397
+ }
8398
+
8399
+ // extract embeddings
8400
+ if (cparams.embeddings && embd) {
8401
+ ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
8402
+ GGML_ASSERT(backend_embd != nullptr);
8403
+
8404
+ switch (cparams.pooling_type) {
8405
+ case LLAMA_POOLING_TYPE_NONE:
8406
+ {
8407
+ // extract token embeddings
8408
+ auto & embd_out = lctx.embd;
8409
+
8410
+ if (batch.logits) {
8411
+ embd_out.resize(n_embd * n_tokens);
8412
+ for (uint32_t i = 0; i < n_tokens; i++) {
8413
+ if (batch.logits[i] == 0) {
8414
+ continue;
8415
+ }
8416
+
8417
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
8418
+ }
8419
+ }
8420
+ } break;
8421
+ case LLAMA_POOLING_TYPE_CLS:
8422
+ case LLAMA_POOLING_TYPE_MEAN:
8423
+ {
8424
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
8425
+
8426
+ // extract sequence embeddings
8427
+ auto & embd_seq_out = lctx.embd_seq;
8428
+ embd_seq_out.clear();
8429
+
8430
+ for (uint32_t i = 0; i < n_tokens; i++) {
8431
+ const llama_seq_id seq_id = batch.seq_id[i][0];
8432
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
8433
+ continue;
8434
+ }
8435
+ embd_seq_out[seq_id].resize(n_embd);
8436
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
8437
+ }
8438
+ } break;
8439
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
8440
+ {
8441
+ GGML_ASSERT(false && "unknown pooling type");
8442
+ } break;
8443
+ }
8444
+ ggml_backend_synchronize(backend_embd);
8445
+ }
8446
+
8447
+ // measure the performance only for the single-token evals
8448
+ if (n_tokens == 1) {
8449
+ lctx.t_eval_us += ggml_time_us() - t_start_us;
8450
+ lctx.n_eval++;
8451
+ }
8452
+ else if (n_tokens > 1) {
8453
+ lctx.t_p_eval_us += ggml_time_us() - t_start_us;
8454
+ lctx.n_p_eval += n_tokens;
8455
+ }
8456
+
8457
+ // get a more accurate load time, upon first eval
8458
+ // TODO: fix this
8459
+ if (!lctx.has_evaluated_once) {
8460
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
8461
+ lctx.has_evaluated_once = true;
8462
+ }
8463
+
8464
+ return 0;
8465
+ }
8466
+
8467
+ // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
8468
+ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8469
+ auto & kv_self = lctx.kv_self;
8470
+
8471
+ const auto & hparams = lctx.model.hparams;
8472
+
8473
+ const uint32_t n_layer = hparams.n_layer;
8474
+
8475
+ const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
8476
+ const uint32_t n_used = kv_self.used;
8477
+
8478
+ assert(n_used <= n_kv);
8479
+
8480
+ //const int64_t t_start = ggml_time_us();
8481
+
8482
+ // number of cells moved
8483
+ uint32_t n_moves = 0;
8484
+
8485
+ // determine which KV cells to move where
8486
+ //
8487
+ // cell i moves to ids[i]
8488
+ //
8489
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
8490
+ //
8491
+ std::vector<uint32_t> ids(n_kv, n_kv);
8492
+
8493
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
8494
+ const auto & cell0 = kv_self.cells[i0];
8495
+
8496
+ if (!cell0.is_empty()) {
8497
+ ids[i0] = i0;
8498
+
8499
+ continue;
8500
+ }
8501
+
8502
+ // found a hole - fill it with data from the end of the cache
8503
+
8504
+ uint32_t nh = 1;
8505
+
8506
+ // determine the size of the hole
8507
+ while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
8508
+ nh++;
8509
+ }
8510
+
8511
+ // each move requires 6*n_layer tensors (see build_defrag)
8512
+ // - source view, destination view, copy operation
8513
+ // - x2 for keys and values
8514
+ //
8515
+ if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
8516
+ // the graph is too big, we cannot move more cells
8517
+ break;
8518
+ }
8519
+
8520
+ uint32_t nf = 0;
8521
+ uint32_t is = n_kv - 1;
8522
+
8523
+ // starting from the end, find nh non-empty cells
8524
+ for (; is > i0; --is) {
8525
+ const auto & cell1 = kv_self.cells[is];
8526
+
8527
+ if (cell1.is_empty() || ids[is] != n_kv) {
8528
+ continue;
8529
+ }
8530
+
8531
+ // non-empty cell which is not yet moved
8532
+ nf++;
8533
+
8534
+ if (nf == nh) {
8535
+ break;
8536
+ }
8537
+ }
8538
+
8539
+ // this can only happen if `n_used` is not accurate, which would be a bug
8540
+ GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
8541
+
8542
+ nf = 0;
8543
+
8544
+ uint32_t i1 = is;
8545
+
8546
+ // are we moving a continuous block of memory?
8547
+ bool cont = false;
8548
+
8549
+ // go back and move the nf cells to the hole
8550
+ for (; i1 < n_kv; ++i1) {
8551
+ auto & cell1 = kv_self.cells[i1];
8552
+
8553
+ if (cell1.is_empty() || ids[i1] != n_kv) {
8554
+ cont = false;
8555
+ continue;
8556
+ }
8557
+
8558
+ // this cell goes to (i0 + nf)
8559
+ ids[i1] = i0 + nf;
8560
+
8561
+ // move the cell meta data
8562
+ kv_self.cells[i0 + nf] = cell1;
8563
+
8564
+ // clear the old cell and move the head there
8565
+ cell1 = llama_kv_cell();
8566
+ kv_self.head = n_used;
8567
+
8568
+ if (!cont) {
8569
+ n_moves++;
8570
+ cont = true;
8571
+ }
8572
+
8573
+ nf++;
8574
+
8575
+ if (nf == nh) {
8576
+ break;
8577
+ }
8578
+ }
8579
+
8580
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
8581
+
8582
+ i0 += nh - 1;
8583
+ }
8584
+
8585
+ if (n_moves == 0) {
8586
+ return;
8587
+ }
8588
+
8589
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
8590
+
8591
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
8592
+
8593
+ #if 0
8594
+ // CPU defrag
8595
+ //
8596
+ // TODO: optimizations are possible:
8597
+ // - multiple threads
8598
+ // - avoid copying to the host memory when already there
8599
+ //
8600
+ // likely not worth the effort, as we have ggml_graph based defrag
8601
+ //
8602
+
8603
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
8604
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
8605
+
8606
+ const uint32_t kv_size = kv_self.size;
8607
+
8608
+ std::vector<uint8_t> buf_k;
8609
+ std::vector<uint8_t> buf_v;
8610
+
8611
+ for (uint32_t il = 0; il < n_layer; ++il) {
8612
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
8613
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
8614
+
8615
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
8616
+ const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
8617
+
8618
+ buf_k.resize(k_size);
8619
+ buf_v.resize(v_size);
8620
+
8621
+ ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8622
+ ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8623
+
8624
+ // batch move [i, i+nm) to [id, id+nm)
8625
+ // note: cells can move only to a lower index
8626
+ for (uint32_t i = 0; i < n_kv; ++i) {
8627
+ const uint32_t id = ids[i];
8628
+
8629
+ if (i == id || id == n_kv) {
8630
+ continue;
8631
+ }
8632
+
8633
+ uint32_t nm = 1;
8634
+
8635
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
8636
+ nm++;
8637
+ }
8638
+
8639
+ // move keys
8640
+ {
8641
+ const int64_t os = i*k_size_row;
8642
+ const int64_t od = id*k_size_row;
8643
+
8644
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
8645
+ }
8646
+
8647
+ // move values (note: they are transposed)
8648
+ {
8649
+ const int64_t os = i;
8650
+ const int64_t od = id;
8651
+
8652
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
8653
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
8654
+ }
8655
+ }
8656
+
8657
+ i += nm - 1;
8658
+ }
8659
+
8660
+ ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8661
+ ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8662
+ }
8663
+ #else
8664
+ // ggml_graph defrag
8665
+
8666
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
8667
+
8668
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8669
+ #endif
8670
+
8671
+ //const int64_t t_end = ggml_time_us();
8672
+
8673
+ //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
8674
+ }
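The planning step above walks the cache front to back and, for every hole, pulls not-yet-moved occupied cells from the back, recording the destinations in ids (ids[i] == i or ids[i] == n_kv means cell i stays put); the real code additionally batches contiguous runs and stops before exceeding the LLAMA_MAX_NODES graph limit. A simplified stand-alone version of just that planning over an occupancy vector, leaving out run batching and the node limit:

    #include <cstdint>
    #include <vector>

    // ids[i] is the destination slot of cell i; ids[i] == i or ids[i] == n_kv means "do not move".
    static std::vector<uint32_t> toy_plan_defrag(const std::vector<bool> & occupied, uint32_t n_used) {
        const uint32_t n_kv = (uint32_t) occupied.size();
        std::vector<uint32_t> ids(n_kv, n_kv);

        uint32_t src = n_kv; // next candidate source, scanned from the back
        for (uint32_t dst = 0; dst < n_used; ++dst) {
            if (occupied[dst]) {
                ids[dst] = dst;  // already in place
                continue;
            }
            // fill the hole with the last occupied, not-yet-moved cell
            while (src > dst) {
                --src;
                if (occupied[src] && ids[src] == n_kv) {
                    ids[src] = dst;
                    break;
                }
            }
            if (src <= dst) {
                break; // nothing left to move
            }
        }
        return ids;
    }

    // Example: occupied = {1,0,1,0,1,1} with n_used = 4 occupied cells:
    // cell 5 moves to slot 1 and cell 4 to slot 3, compacting the cache front.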
8675
+
8676
+ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
8677
+ // apply K-shift if needed
8678
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
8679
+ llama_set_k_shift(lctx);
8680
+
8681
+ {
8682
+ ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
8683
+
8684
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8685
+ }
8022
8686
 
8023
- // extract embeddings
8024
- if (!lctx.embedding.empty()) {
8025
- auto & embedding_out = lctx.embedding;
8687
+ {
8688
+ auto & kv_self = lctx.kv_self;
8026
8689
 
8027
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8028
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8690
+ kv_self.has_shift = false;
8029
8691
 
8030
- embedding_out.resize(embd_size);
8031
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8032
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8033
- ggml_backend_synchronize(embeddings_backend);
8692
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
8693
+ kv_self.cells[i].delta = 0;
8694
+ }
8695
+ }
8034
8696
  }
8035
8697
 
8036
- // measure the performance only for the single-token evals
8037
- if (n_tokens == 1) {
8038
- lctx.t_eval_us += ggml_time_us() - t_start_us;
8039
- lctx.n_eval++;
8040
- }
8041
- else if (n_tokens > 1) {
8042
- lctx.t_p_eval_us += ggml_time_us() - t_start_us;
8043
- lctx.n_p_eval += n_tokens;
8044
- }
8698
+ // defragment the KV cache if needed
8699
+ if (lctx.kv_self.do_defrag) {
8700
+ llama_kv_cache_defrag_internal(lctx);
8045
8701
 
8046
- // get a more accurate load time, upon first eval
8047
- // TODO: fix this
8048
- if (!lctx.has_evaluated_once) {
8049
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
8050
- lctx.has_evaluated_once = true;
8702
+ lctx.kv_self.do_defrag = false;
8051
8703
  }
8052
-
8053
- return 0;
8054
8704
  }
8055
8705
 
8056
8706
  //
@@ -8085,19 +8735,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
8085
8735
  GGML_ASSERT(llama_is_byte_token(vocab, id));
8086
8736
  const auto& token_data = vocab.id_to_token.at(id);
8087
8737
  switch (llama_vocab_get_type(vocab)) {
8088
- case LLAMA_VOCAB_TYPE_SPM: {
8089
- auto buf = token_data.text.substr(3, 2);
8090
- return strtol(buf.c_str(), NULL, 16);
8091
- }
8092
- case LLAMA_VOCAB_TYPE_BPE: {
8093
- GGML_ASSERT(false);
8094
- return unicode_to_bytes_bpe(token_data.text);
8095
- }
8096
- case LLAMA_VOCAB_TYPE_WPM: {
8097
- GGML_ASSERT(false);
8098
- }
8099
- default:
8100
- GGML_ASSERT(false);
8738
+ case LLAMA_VOCAB_TYPE_SPM: {
8739
+ auto buf = token_data.text.substr(3, 2);
8740
+ return strtol(buf.c_str(), NULL, 16);
8741
+ }
8742
+ case LLAMA_VOCAB_TYPE_BPE: {
8743
+ GGML_ASSERT(false);
8744
+ return unicode_to_bytes_bpe(token_data.text);
8745
+ }
8746
+ case LLAMA_VOCAB_TYPE_WPM: {
8747
+ GGML_ASSERT(false);
8748
+ }
8749
+ default:
8750
+ GGML_ASSERT(false);
8101
8751
  }
8102
8752
  }
8103
8753
 
@@ -8644,37 +9294,46 @@ struct llm_tokenizer_wpm {
8644
9294
  }
8645
9295
 
8646
9296
  std::vector<std::string> preprocess(const std::string & text) {
8647
- std::string ori_str = normalize(text);
8648
- uint64_t ori_size = ori_str.size();
9297
+ // normalization form D
9298
+ std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
9299
+ std::vector<uint32_t> nfd_codepoints;
9300
+ for (uint32_t code : codepoints) {
9301
+ auto it = nfd_map.equal_range(code);
9302
+ if (it.first != it.second) {
9303
+ for (auto jt = it.first; jt != it.second; jt++) {
9304
+ nfd_codepoints.push_back(jt->second);
9305
+ }
9306
+ } else {
9307
+ nfd_codepoints.push_back(code);
9308
+ }
9309
+ }
8649
9310
 
8650
- // single punct / single symbol / single digit
8651
- // baseline: add whitespace on the left and right of punct and chinese characters
8652
- std::vector<std::string> words;
9311
+ // strip accents, strip control, uniformize whitespace,
9312
+ // to lowercase, pad chinese characters, pad punctuation
8653
9313
  std::string new_str = "";
8654
- uint64_t i = 0;
8655
- while (i < ori_size) {
8656
- int utf_char_len = utf8_len(ori_str[i]);
8657
- if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8658
- new_str += " ";
8659
- new_str += ori_str[i];
8660
- new_str += " ";
8661
- i += 1;
9314
+ for (uint32_t code : nfd_codepoints) {
9315
+ int type = codepoint_type(code);
9316
+ if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
9317
+ continue;
9318
+ }
9319
+ code = to_lower(code);
9320
+ if (type == CODEPOINT_TYPE_WHITESPACE) {
9321
+ code = ' ';
8662
9322
  }
8663
- else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
9323
+ std::string s = codepoint_to_utf8(code);
9324
+ if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
8664
9325
  new_str += " ";
8665
- new_str += ori_str.substr(i, 3);
9326
+ new_str += s;
8666
9327
  new_str += " ";
8667
- i += 3;
8668
- }
8669
- else {
8670
- new_str += ori_str[i];
8671
- i += 1;
9328
+ } else {
9329
+ new_str += s;
8672
9330
  }
8673
9331
  }
8674
9332
 
8675
9333
  // split by whitespace
8676
9334
  uint64_t l = 0;
8677
9335
  uint64_t r = 0;
9336
+ std::vector<std::string> words;
8678
9337
  while (r < new_str.size()) {
8679
9338
  // if is whitespace
8680
9339
  if (isspace(new_str[r])) {
@@ -8692,47 +9351,21 @@ struct llm_tokenizer_wpm {
8692
9351
  return words;
8693
9352
  }
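The rewritten preprocess NFD-decomposes the input via nfd_map, drops accent marks and control codepoints, lowercases, maps all whitespace to a single space, pads punctuation and CJK characters with spaces, and finally splits on whitespace. An ASCII-only sketch of the pad-and-split stage (the Unicode/NFD handling of the real code is deliberately left out):

    #include <cctype>
    #include <string>
    #include <vector>

    // Lowercase, surround punctuation with spaces, then split on whitespace -
    // the same shape as llm_tokenizer_wpm::preprocess, minus NFD and CJK handling.
    static std::vector<std::string> toy_wpm_preprocess(const std::string & text) {
        std::string padded;
        for (unsigned char c : text) {
            if (std::iscntrl(c)) { continue; }
            if (std::isspace(c)) { padded += ' '; continue; }
            if (std::ispunct(c)) { padded += ' '; padded += (char) c; padded += ' '; continue; }
            padded += (char) std::tolower(c);
        }

        std::vector<std::string> words;
        std::string cur;
        for (char c : padded) {
            if (c == ' ') {
                if (!cur.empty()) { words.push_back(cur); cur.clear(); }
            } else {
                cur += c;
            }
        }
        if (!cur.empty()) {
            words.push_back(cur);
        }
        return words;
    }

    // toy_wpm_preprocess("Hello, World!") -> {"hello", ",", "world", "!"}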
8694
9353
 
8695
- std::string normalize(const std::string & text) {
8696
- // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8697
- std::string text2 = strip_accents(text);
8698
- for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8699
- char c = text2[i];
8700
- if (c >= 'A' && c <= 'Z') {
8701
- text2[i] = c - 'A' + 'a';
8702
- }
9354
+ uint32_t to_lower(uint32_t code) {
9355
+ static const std::locale locale("en_US.UTF-8");
9356
+ #if defined(_WIN32)
9357
+ if (code > 0xFFFF) {
9358
+ return code;
8703
9359
  }
8704
- return text2;
9360
+ #endif
9361
+ return std::tolower(wchar_t(code), locale);
8705
9362
  }
8706
9363
 
8707
- bool is_chinese_char(const std::string & str) {
8708
- int len = str.length();
8709
- unsigned int codepoint = 0;
8710
- int num_bytes = 0;
8711
- int i = 0;
8712
- unsigned char ch = static_cast<unsigned char>(str[i]);
8713
- if (ch <= 0x7f) {
8714
- codepoint = ch;
8715
- num_bytes = 1;
8716
- } else if ((ch >> 5) == 0x06) {
8717
- codepoint = ch & 0x1f;
8718
- num_bytes = 2;
8719
- } else if ((ch >> 4) == 0x0e) {
8720
- codepoint = ch & 0x0f;
8721
- num_bytes = 3;
8722
- } else if ((ch >> 3) == 0x1e) {
8723
- codepoint = ch & 0x07;
8724
- num_bytes = 4;
8725
- }
8726
- for (int j = 1; j < num_bytes; ++j) {
8727
- if (i + j >= len) {
8728
- return false; // incomplete UTF-8 character
8729
- }
8730
- unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8731
- if ((next_ch >> 6) != 0x02) {
8732
- return false; // invalid trailing byte
8733
- }
8734
- codepoint = (codepoint << 6) | (next_ch & 0x3f);
8735
- }
9364
+ bool is_ascii_punct(uint32_t code) {
9365
+ return code < 256 && ispunct(code);
9366
+ }
9367
+
9368
+ bool is_chinese_char(uint32_t codepoint) {
8736
9369
  if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8737
9370
  (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8738
9371
  (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
@@ -8748,41 +9381,6 @@ struct llm_tokenizer_wpm {
8748
9381
  return false;
8749
9382
  }
8750
9383
 
8751
- std::string strip_accents(const std::string & input_string) {
8752
- std::string resultString;
8753
- std::map<std::string, char> accent_map = {
8754
- {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8755
- {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8756
- {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8757
- {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8758
- {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8759
- {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8760
- {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8761
- {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8762
- {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8763
- };
8764
-
8765
- for (size_t i = 0; i < input_string.length();) {
8766
- int len = utf8_len(input_string[i]);
8767
- std::string curChar = input_string.substr(i, len);
8768
- auto iter = accent_map.find(curChar);
8769
- if (iter != accent_map.end()) {
8770
- resultString += iter->second;
8771
- } else {
8772
- resultString += curChar;
8773
- }
8774
- i += len;
8775
- }
8776
-
8777
- return resultString;
8778
- }
8779
-
8780
- static size_t utf8_len(char src) {
8781
- const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8782
- uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8783
- return lookup[highbits];
8784
- }
8785
-
8786
9384
  const llama_vocab & vocab;
8787
9385
  };
8788
9386
 
@@ -9816,10 +10414,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
9816
10414
  }
9817
10415
  }
9818
10416
 
9819
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
9820
- llama_sample_temp(ctx, candidates_p, temp);
9821
- }
9822
-
9823
10417
  void llama_sample_repetition_penalties(
9824
10418
  struct llama_context * ctx,
9825
10419
  llama_token_data_array * candidates,
@@ -9946,38 +10540,6 @@ void llama_sample_apply_guidance(
9946
10540
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9947
10541
  }
9948
10542
 
9949
- void llama_sample_classifier_free_guidance(
9950
- struct llama_context * ctx,
9951
- llama_token_data_array * candidates,
9952
- struct llama_context * guidance_ctx,
9953
- float scale) {
9954
- GGML_ASSERT(ctx);
9955
- int64_t t_start_sample_us;
9956
-
9957
- t_start_sample_us = ggml_time_us();
9958
- const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
9959
-
9960
- GGML_ASSERT(n_vocab == candidates->size);
9961
- GGML_ASSERT(!candidates->sorted);
9962
-
9963
- std::vector<float> logits_base(n_vocab);
9964
- for (size_t i = 0; i < n_vocab; ++i) {
9965
- logits_base[i] = candidates->data[i].logit;
9966
- }
9967
-
9968
- float * logits_guidance = llama_get_logits(guidance_ctx);
9969
-
9970
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9971
- llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
9972
- t_start_sample_us = ggml_time_us();
9973
-
9974
- for (size_t i = 0; i < n_vocab; ++i) {
9975
- candidates->data[i].logit = logits_base[i];
9976
- }
9977
-
9978
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9979
- }
9980
-
9981
10543
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
9982
10544
  GGML_ASSERT(ctx);
9983
10545
 
@@ -10411,7 +10973,7 @@ struct quantize_state_internal {
10411
10973
  {}
10412
10974
  };
10413
10975
 
10414
- static void llama_convert_tensor_internal(
10976
+ static void llama_tensor_dequantize_internal(
10415
10977
  struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
10416
10978
  const size_t nelements, const int nthread
10417
10979
  ) {
@@ -10508,31 +11070,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10508
11070
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10509
11071
  new_type = GGML_TYPE_Q8_0;
10510
11072
  }
10511
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11073
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11074
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10512
11075
  new_type = GGML_TYPE_Q5_K;
10513
11076
  }
10514
11077
  else if (new_type != GGML_TYPE_Q8_0) {
10515
11078
  new_type = GGML_TYPE_Q6_K;
10516
11079
  }
10517
11080
  } else if (name == "token_embd.weight") {
10518
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11081
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
11082
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10519
11083
  new_type = GGML_TYPE_Q2_K;
10520
11084
  }
11085
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11086
+ new_type = GGML_TYPE_IQ3_S;
11087
+ }
10521
11088
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10522
- new_type = GGML_TYPE_Q4_K;
11089
+ new_type = GGML_TYPE_IQ3_S;
10523
11090
  }
10524
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11091
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
11092
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10525
11093
  if (name.find("attn_v.weight") != std::string::npos) {
10526
11094
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
10527
- else new_type = GGML_TYPE_Q2_K;
11095
+ else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10528
11096
  ++qs.i_attention_wv;
10529
11097
  }
11098
+ else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
11099
+ new_type = GGML_TYPE_Q4_K;
11100
+ }
10530
11101
  else if (name.find("ffn_down") != std::string::npos) {
10531
- if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
11102
+ if (qs.i_ffn_down < qs.n_ffn_down/8) {
11103
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
11104
+ }
10532
11105
  ++qs.i_ffn_down;
10533
11106
  }
10534
11107
  else if (name.find("attn_output.weight") != std::string::npos) {
10535
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
11108
+ if (qs.model.hparams.n_expert == 8) {
11109
+ new_type = GGML_TYPE_Q5_K;
11110
+ } else {
11111
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
11112
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
11113
+ }
10536
11114
  }
10537
11115
  } else if (name.find("attn_v.weight") != std::string::npos) {
10538
11116
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -10542,13 +11120,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10542
11120
  new_type = GGML_TYPE_Q4_K;
10543
11121
  }
10544
11122
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10545
- new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
11123
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
11124
+ }
11125
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
11126
+ new_type = GGML_TYPE_Q4_K;
11127
+ }
11128
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11129
+ new_type = GGML_TYPE_Q4_K;
11130
+ }
11131
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
11132
+ new_type = GGML_TYPE_Q4_K;
11133
+ }
11134
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11135
+ new_type = GGML_TYPE_Q4_K;
10546
11136
  }
10547
11137
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
10548
11138
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
10549
11139
  }
10550
11140
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10551
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
11141
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
10552
11142
  new_type = GGML_TYPE_Q5_K;
10553
11143
  }
10554
11144
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -10574,14 +11164,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10574
11164
  // TODO: explore better strategies
10575
11165
  new_type = GGML_TYPE_Q8_0;
10576
11166
  }
10577
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10578
- new_type = GGML_TYPE_Q2_K;
11167
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
11168
+ new_type = GGML_TYPE_IQ3_XXS;
11169
+ }
11170
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11171
+ new_type = GGML_TYPE_IQ2_S;
11172
+ }
11173
+ } else if (name.find("attn_q.weight") != std::string::npos) {
11174
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
11175
+ new_type = GGML_TYPE_IQ3_XXS;
11176
+ }
11177
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11178
+ new_type = GGML_TYPE_IQ2_S;
10579
11179
  }
10580
11180
  } else if (name.find("ffn_down") != std::string::npos) {
10581
11181
  auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
10582
11182
  int i_layer = info.first, n_layer = info.second;
10583
11183
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
10584
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
11184
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
10585
11185
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
10586
11186
  }
10587
11187
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10592,6 +11192,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10592
11192
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
10593
11193
  : GGML_TYPE_Q3_K;
10594
11194
  }
11195
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
11196
+ (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
11197
+ new_type = GGML_TYPE_Q4_K;
11198
+ }
10595
11199
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
10596
11200
  new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
10597
11201
  }
@@ -10603,8 +11207,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10603
11207
  if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10604
11208
  }
10605
11209
  }
10606
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
10607
- if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
11210
+ else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
11211
+ new_type = GGML_TYPE_Q5_K;
10608
11212
  }
10609
11213
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10610
11214
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -10621,39 +11225,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10621
11225
  } else if (name.find("attn_output.weight") != std::string::npos) {
10622
11226
  if (arch != LLM_ARCH_FALCON) {
10623
11227
  if (qs.model.hparams.n_expert == 8) {
10624
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11228
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10625
11229
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
10626
- ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
11230
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
11231
+ ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
10627
11232
  new_type = GGML_TYPE_Q5_K;
10628
11233
  }
10629
11234
  } else {
10630
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10631
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
10632
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
10633
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
11235
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
11236
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
11237
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
11238
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
11239
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
10634
11240
  }
10635
11241
  } else {
10636
11242
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10637
11243
  }
10638
11244
  }
10639
11245
  else if (name.find("attn_qkv.weight") != std::string::npos) {
10640
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
11246
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11247
+ new_type = GGML_TYPE_Q4_K;
11248
+ }
10641
11249
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
10642
11250
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
10643
11251
  }
10644
11252
  else if (name.find("ffn_gate") != std::string::npos) {
10645
11253
  auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
10646
11254
  int i_layer = info.first, n_layer = info.second;
10647
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10648
- new_type = GGML_TYPE_Q2_K;
11255
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
11256
+ new_type = GGML_TYPE_IQ3_XXS;
10649
11257
  }
10650
11258
  ++qs.i_ffn_gate;
10651
11259
  }
10652
11260
  else if (name.find("ffn_up") != std::string::npos) {
10653
11261
  auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
10654
11262
  int i_layer = info.first, n_layer = info.second;
10655
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10656
- new_type = GGML_TYPE_Q2_K;
11263
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
11264
+ new_type = GGML_TYPE_IQ3_XXS;
10657
11265
  }
10658
11266
  ++qs.i_ffn_up;
10659
11267
  }
@@ -10671,9 +11279,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10671
11279
  //}
10672
11280
  bool convert_incompatible_tensor = false;
10673
11281
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
10674
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
10675
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
10676
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11282
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
11283
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
11284
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
10677
11285
  int nx = tensor->ne[0];
10678
11286
  int ny = tensor->ne[1];
10679
11287
  if (nx % QK_K != 0) {
@@ -10687,13 +11295,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10687
11295
  switch (new_type) {
10688
11296
  case GGML_TYPE_IQ2_XXS:
10689
11297
  case GGML_TYPE_IQ2_XS:
11298
+ case GGML_TYPE_IQ2_S:
10690
11299
  case GGML_TYPE_IQ3_XXS:
11300
+ case GGML_TYPE_IQ3_S:
10691
11301
  case GGML_TYPE_IQ1_S:
10692
11302
  case GGML_TYPE_Q2_K:
10693
- case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
10694
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10695
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10696
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
11303
+ case GGML_TYPE_Q3_K:
11304
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
11305
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
11306
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
11307
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10697
11308
  default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
10698
11309
  }
10699
11310
  LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
@@ -10703,6 +11314,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10703
11314
  return new_type;
10704
11315
  }
10705
11316
 
11317
+ static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
11318
+ std::mutex mutex;
11319
+ int counter = 0;
11320
+ size_t new_size = 0;
11321
+ if (nthread < 2) {
11322
+ // single-thread
11323
+ return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
11324
+ }
11325
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
11326
+ nrows, n_per_row, imatrix]() {
11327
+ std::array<int64_t, 1 << 4> local_hist = {};
11328
+ const int nrows_per_chunk = chunk_size / n_per_row;
11329
+ size_t local_size = 0;
11330
+ while (true) {
11331
+ std::unique_lock<std::mutex> lock(mutex);
11332
+ int first_row = counter; counter += nrows_per_chunk;
11333
+ if (first_row >= nrows) {
11334
+ if (local_size > 0) {
11335
+ for (int j=0; j<int(local_hist.size()); ++j) {
11336
+ hist_cur[j] += local_hist[j];
11337
+ }
11338
+ new_size += local_size;
11339
+ }
11340
+ break;
11341
+ }
11342
+ lock.unlock();
11343
+ const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
11344
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
11345
+ first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
11346
+ }
11347
+ };
11348
+ for (int it = 0; it < nthread - 1; ++it) {
11349
+ workers.emplace_back(compute);
11350
+ }
11351
+ compute();
11352
+ for (auto & w : workers) { w.join(); }
11353
+ workers.clear();
11354
+ return new_size;
11355
+ }
11356
+
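The per-tensor multi-threaded quantization loop is factored out of llama_model_quantize_internal() into the new llama_tensor_quantize_internal() helper above; the work distribution itself is unchanged. Worker threads claim chunks of rows through a mutex-guarded counter and merge their local histogram and size totals once the counter runs out. A reduced, self-contained sketch of just that pattern, with simple arithmetic standing in for ggml_quantize_chunk():

#include <algorithm>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    const int nrows = 1000, nrows_per_chunk = 64, nthread = 4;
    std::mutex mutex;
    int  counter = 0;   // next row to hand out, guarded by mutex
    long total   = 0;   // merged result, guarded by mutex

    auto compute = [&]() {
        long local = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int first_row = counter; counter += nrows_per_chunk;
            if (first_row >= nrows) {
                total += local;   // publish the local result while still holding the lock
                break;
            }
            lock.unlock();
            const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            local += this_nrow;   // stand-in for ggml_quantize_chunk()
        }
    };

    std::vector<std::thread> workers;
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute();   // the calling thread participates, as in the helper above
    for (auto & w : workers) {
        w.join();
    }

    std::printf("processed %ld rows\n", total);   // prints: processed 1000 rows
    return 0;
}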
10706
11357
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
10707
11358
  ggml_type quantized_type;
10708
11359
  llama_ftype ftype = params->ftype;
@@ -10719,7 +11370,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10719
11370
  // K-quants
10720
11371
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10721
11372
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10722
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
11373
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
10723
11374
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10724
11375
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10725
11376
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -10730,9 +11381,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10730
11381
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10731
11382
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10732
11383
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
11384
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
11385
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
10733
11386
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10734
11387
  case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10735
11388
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
11389
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
11390
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
11391
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
10736
11392
 
10737
11393
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
10738
11394
  }
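The switch gains cases for the new file types: IQ2_S maps to GGML_TYPE_IQ2_XS, IQ2_M to GGML_TYPE_IQ2_S, IQ3_XS/IQ3_S/IQ3_M all to GGML_TYPE_IQ3_S, and IQ4_XS to GGML_TYPE_IQ4_XS. A usage sketch driving one of the new types through the public API (the file names are placeholders):

#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;   // resolved to GGML_TYPE_IQ4_XS by the switch above

    // placeholder paths; returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &params);
}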
@@ -10810,7 +11466,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10810
11466
 
10811
11467
  std::vector<std::thread> workers;
10812
11468
  workers.reserve(nthread);
10813
- std::mutex mutex;
10814
11469
 
10815
11470
  int idx = 0;
10816
11471
 
@@ -10862,7 +11517,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10862
11517
  quantize &= !params->only_copy;
10863
11518
 
10864
11519
  // do not quantize expert gating tensors
10865
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
11520
+ // NOTE: can't use LLM_TN here because the layer number is not known
11521
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
10866
11522
 
10867
11523
  // do not quantize positional embeddings and token types (BERT)
10868
11524
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
@@ -10906,6 +11562,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10906
11562
  }
10907
11563
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10908
11564
  new_type == GGML_TYPE_IQ2_XS ||
11565
+ new_type == GGML_TYPE_IQ2_S ||
10909
11566
  new_type == GGML_TYPE_IQ1_S ||
10910
11567
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10911
11568
  LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -10922,7 +11579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10922
11579
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
10923
11580
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
10924
11581
  } else {
10925
- llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
11582
+ llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
10926
11583
  f32_data = (float *) f32_conv_buf.data();
10927
11584
  }
10928
11585
 
@@ -10943,41 +11600,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10943
11600
 
10944
11601
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
10945
11602
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
10946
- if (nthread_use < 2) {
10947
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
10948
- } else {
10949
- int counter = 0;
10950
- new_size = 0;
10951
- auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
10952
- nrows, n_per_row, imatrix]() {
10953
- std::array<int64_t, 1 << 4> local_hist = {};
10954
- const int nrows_per_chunk = chunk_size / n_per_row;
10955
- size_t local_size = 0;
10956
- while (true) {
10957
- std::unique_lock<std::mutex> lock(mutex);
10958
- int first_row = counter; counter += nrows_per_chunk;
10959
- if (first_row >= nrows) {
10960
- if (local_size > 0) {
10961
- for (int j=0; j<int(local_hist.size()); ++j) {
10962
- hist_cur[j] += local_hist[j];
10963
- }
10964
- new_size += local_size;
10965
- }
10966
- break;
10967
- }
10968
- lock.unlock();
10969
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
10970
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
10971
- first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
10972
- }
10973
- };
10974
- for (int it = 0; it < nthread_use - 1; ++it) {
10975
- workers.emplace_back(compute);
10976
- }
10977
- compute();
10978
- for (auto & w : workers) { w.join(); }
10979
- workers.clear();
10980
- }
11603
+ new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
10981
11604
 
10982
11605
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
10983
11606
  int64_t tot_count = 0;
@@ -11327,7 +11950,7 @@ static int llama_apply_lora_from_file_internal(
11327
11950
  struct llama_model_params llama_model_default_params() {
11328
11951
  struct llama_model_params result = {
11329
11952
  /*.n_gpu_layers =*/ 0,
11330
- /*.split_mode =*/ LLAMA_SPLIT_LAYER,
11953
+ /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
11331
11954
  /*.main_gpu =*/ 0,
11332
11955
  /*.tensor_split =*/ nullptr,
11333
11956
  /*.progress_callback =*/ nullptr,
@@ -11353,7 +11976,8 @@ struct llama_context_params llama_context_default_params() {
11353
11976
  /*.n_batch =*/ 512,
11354
11977
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
11355
11978
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
11356
- /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
11979
+ /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
11980
+ /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
11357
11981
  /*.rope_freq_base =*/ 0.0f,
11358
11982
  /*.rope_freq_scale =*/ 0.0f,
11359
11983
  /*.yarn_ext_factor =*/ -1.0f,
@@ -11361,15 +11985,16 @@ struct llama_context_params llama_context_default_params() {
11361
11985
  /*.yarn_beta_fast =*/ 32.0f,
11362
11986
  /*.yarn_beta_slow =*/ 1.0f,
11363
11987
  /*.yarn_orig_ctx =*/ 0,
11988
+ /*.defrag_thold =*/ -1.0f,
11364
11989
  /*.cb_eval =*/ nullptr,
11365
11990
  /*.cb_eval_user_data =*/ nullptr,
11366
11991
  /*.type_k =*/ GGML_TYPE_F16,
11367
11992
  /*.type_v =*/ GGML_TYPE_F16,
11368
- /*.mul_mat_q =*/ true,
11369
11993
  /*.logits_all =*/ false,
11370
- /*.embedding =*/ false,
11994
+ /*.embeddings =*/ false,
11371
11995
  /*.offload_kqv =*/ true,
11372
- /*.do_pooling =*/ true,
11996
+ /*.abort_callback =*/ nullptr,
11997
+ /*.abort_callback_data =*/ nullptr,
11373
11998
  };
11374
11999
 
11375
12000
  return result;
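The new context defaults reflect several public struct changes: mul_mat_q and do_pooling are gone, embedding is renamed to embeddings, and pooling_type, defrag_thold and the abort-callback pair are new. A sketch of a caller opting into embeddings with a cooperative abort hook (the field values are illustrative):

#include "llama.h"

// Runs on the decode path; returning true aborts the current llama_decode() call.
static bool should_abort(void * /*user_data*/) {
    return false;
}

llama_context_params make_params() {
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings          = true;                            // was "embedding" in 0.12.x
    cparams.pooling_type        = LLAMA_POOLING_TYPE_UNSPECIFIED;  // defer to the model metadata
    cparams.defrag_thold        = 0.1f;                            // negative (the default) leaves defragmentation off
    cparams.abort_callback      = should_abort;
    cparams.abort_callback_data = nullptr;
    return cparams;
}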
@@ -11421,15 +12046,6 @@ bool llama_supports_gpu_offload(void) {
11421
12046
  #endif
11422
12047
  }
11423
12048
 
11424
- // deprecated:
11425
- bool llama_mmap_supported(void) {
11426
- return llama_supports_mmap();
11427
- }
11428
-
11429
- bool llama_mlock_supported(void) {
11430
- return llama_supports_mlock();
11431
- }
11432
-
11433
12049
  void llama_backend_init(void) {
11434
12050
  ggml_time_init();
11435
12051
 
@@ -11525,9 +12141,10 @@ struct llama_context * llama_new_context_with_model(
11525
12141
  cparams.yarn_attn_factor = params.yarn_attn_factor;
11526
12142
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11527
12143
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11528
- cparams.mul_mat_q = params.mul_mat_q;
12144
+ cparams.defrag_thold = params.defrag_thold;
12145
+ cparams.embeddings = params.embeddings;
11529
12146
  cparams.offload_kqv = params.offload_kqv;
11530
- cparams.do_pooling = params.do_pooling;
12147
+ cparams.pooling_type = params.pooling_type;
11531
12148
 
11532
12149
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
11533
12150
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -11541,16 +12158,24 @@ struct llama_context * llama_new_context_with_model(
11541
12158
  cparams.cb_eval_user_data = params.cb_eval_user_data;
11542
12159
 
11543
12160
  auto rope_scaling_type = params.rope_scaling_type;
11544
- if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
12161
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
11545
12162
  rope_scaling_type = hparams.rope_scaling_type_train;
11546
12163
  }
11547
12164
 
11548
- if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
12165
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
11549
12166
  cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
11550
12167
  }
11551
12168
 
11552
12169
  if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
11553
- cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
12170
+ cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
12171
+ }
12172
+
12173
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12174
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12175
+ cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
12176
+ } else {
12177
+ cparams.pooling_type = hparams.pooling_type;
12178
+ }
11554
12179
  }
11555
12180
 
11556
12181
  if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11561,8 +12186,11 @@ struct llama_context * llama_new_context_with_model(
11561
12186
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
11562
12187
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
11563
12188
 
11564
- ctx->rng = std::mt19937(params.seed);
11565
- ctx->logits_all = params.logits_all;
12189
+ ctx->abort_callback = params.abort_callback;
12190
+ ctx->abort_callback_data = params.abort_callback_data;
12191
+
12192
+ ctx->rng = std::mt19937(params.seed);
12193
+ ctx->logits_all = params.logits_all;
11566
12194
 
11567
12195
  const ggml_type type_k = params.type_k;
11568
12196
  const ggml_type type_v = params.type_v;
@@ -11584,8 +12212,8 @@ struct llama_context * llama_new_context_with_model(
11584
12212
  }
11585
12213
  #elif defined(GGML_USE_CUBLAS)
11586
12214
  if (model->n_gpu_layers > 0) {
11587
- // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
11588
- if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
12215
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12216
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
11589
12217
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
11590
12218
  if (backend == nullptr) {
11591
12219
  LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11594,7 +12222,7 @@ struct llama_context * llama_new_context_with_model(
11594
12222
  }
11595
12223
  ctx->backends.push_back(backend);
11596
12224
  } else {
11597
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
12225
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
11598
12226
  for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
11599
12227
  ggml_backend_t backend = ggml_backend_cuda_init(device);
11600
12228
  if (backend == nullptr) {
@@ -11620,13 +12248,31 @@ struct llama_context * llama_new_context_with_model(
11620
12248
  }
11621
12249
  #elif defined(GGML_USE_SYCL)
11622
12250
  if (model->n_gpu_layers > 0) {
11623
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
11624
- if (backend == nullptr) {
11625
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
11626
- llama_free(ctx);
11627
- return nullptr;
12251
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12252
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12253
+ int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
12254
+ ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
12255
+ if (backend == nullptr) {
12256
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
12257
+ llama_free(ctx);
12258
+ return nullptr;
12259
+ }
12260
+ ctx->backends.push_back(backend);
12261
+ } else {
12262
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
12263
+ int id_list[GGML_SYCL_MAX_DEVICES];
12264
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
12265
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
12266
+ int device_id = id_list[i];
12267
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
12268
+ if (backend == nullptr) {
12269
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
12270
+ llama_free(ctx);
12271
+ return nullptr;
12272
+ }
12273
+ ctx->backends.push_back(backend);
12274
+ }
11628
12275
  }
11629
- ctx->backends.push_back(backend);
11630
12276
  }
11631
12277
  #elif defined(GGML_USE_KOMPUTE)
11632
12278
  if (model->n_gpu_layers > 0) {
@@ -11647,8 +12293,7 @@ struct llama_context * llama_new_context_with_model(
11647
12293
  }
11648
12294
  ctx->backends.push_back(ctx->backend_cpu);
11649
12295
 
11650
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
11651
- cparams.n_ctx, cparams.offload_kqv)) {
12296
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
11652
12297
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
11653
12298
  llama_free(ctx);
11654
12299
  return nullptr;
@@ -11675,8 +12320,8 @@ struct llama_context * llama_new_context_with_model(
11675
12320
  // resized during inference, reserve maximum
11676
12321
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11677
12322
 
11678
- if (params.embedding) {
11679
- ctx->embedding.resize(hparams.n_embd);
12323
+ if (params.embeddings) {
12324
+ ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
11680
12325
  }
11681
12326
 
11682
12327
  // graph inputs
@@ -11707,7 +12352,6 @@ struct llama_context * llama_new_context_with_model(
11707
12352
  ggml_set_name(ctx->inp_cls, "inp_cls");
11708
12353
 
11709
12354
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
11710
-
11711
12355
  LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
11712
12356
  ggml_backend_buffer_name(ctx->buf_input),
11713
12357
  ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
@@ -11727,7 +12371,7 @@ struct llama_context * llama_new_context_with_model(
11727
12371
  }
11728
12372
 
11729
12373
  // buffer used to store the computation graph and the tensor meta data
11730
- ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
12374
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
11731
12375
 
11732
12376
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
11733
12377
 
@@ -11796,6 +12440,50 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
11796
12440
  return model->vocab.type;
11797
12441
  }
11798
12442
 
12443
+ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12444
+ switch (model->arch) {
12445
+ // these models do not use RoPE
12446
+ case LLM_ARCH_GPT2:
12447
+ case LLM_ARCH_GPTJ:
12448
+ case LLM_ARCH_GPTNEOX:
12449
+ case LLM_ARCH_MPT:
12450
+ case LLM_ARCH_REFACT:
12451
+ case LLM_ARCH_BLOOM:
12452
+ return LLAMA_ROPE_TYPE_NONE;
12453
+
12454
+ // use what we call a normal RoPE, operating on pairs of consecutive head values
12455
+ case LLM_ARCH_LLAMA:
12456
+ case LLM_ARCH_BAICHUAN:
12457
+ case LLM_ARCH_STARCODER:
12458
+ case LLM_ARCH_PLAMO:
12459
+ case LLM_ARCH_CODESHELL:
12460
+ case LLM_ARCH_ORION:
12461
+ case LLM_ARCH_INTERNLM2:
12462
+ case LLM_ARCH_MINICPM:
12463
+ return LLAMA_ROPE_TYPE_NORM;
12464
+
12465
+ // the pairs of head values are offset by n_rot/2
12466
+ case LLM_ARCH_FALCON:
12467
+ case LLM_ARCH_PERSIMMON:
12468
+ case LLM_ARCH_BERT:
12469
+ case LLM_ARCH_NOMIC_BERT:
12470
+ case LLM_ARCH_STABLELM:
12471
+ case LLM_ARCH_QWEN:
12472
+ case LLM_ARCH_QWEN2:
12473
+ case LLM_ARCH_PHI2:
12474
+ case LLM_ARCH_GEMMA:
12475
+ case LLM_ARCH_STARCODER2:
12476
+ return LLAMA_ROPE_TYPE_NEOX;
12477
+
12478
+ // all model arches should be listed explicitly here
12479
+ case LLM_ARCH_UNKNOWN:
12480
+ GGML_ASSERT(false && "unknown architecture");
12481
+ break;
12482
+ }
12483
+
12484
+ return LLAMA_ROPE_TYPE_NONE;
12485
+ }
12486
+
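The new llama_rope_type() accessor exposes which rotary-embedding variant a model uses (none, normal pairwise, or NeoX-style) instead of leaving callers to infer it from the architecture. An illustrative helper built on it:

#include "llama.h"

// E.g. useful when applying ggml_rope to tensors outside the library and the
// rotation mode has to match the model.
bool uses_neox_rope(const llama_model * model) {
    return llama_rope_type(model) == LLAMA_ROPE_TYPE_NEOX;
}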
11799
12487
  int32_t llama_n_vocab(const struct llama_model * model) {
11800
12488
  return model->vocab.id_to_token.size();
11801
12489
  }
@@ -11898,15 +12586,6 @@ uint32_t llama_model_quantize(
11898
12586
  }
11899
12587
  }
11900
12588
 
11901
- int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11902
- try {
11903
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
11904
- } catch (const std::exception & err) {
11905
- LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
11906
- return 1;
11907
- }
11908
- }
11909
-
11910
12589
  int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11911
12590
  try {
11912
12591
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12038,12 +12717,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
12038
12717
  llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
12039
12718
  }
12040
12719
 
12041
- void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12720
+ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12042
12721
  if (delta == 0) {
12043
12722
  return;
12044
12723
  }
12045
12724
 
12046
- llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
12725
+ llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
12047
12726
  }
12048
12727
 
12049
12728
  void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -12054,6 +12733,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
12054
12733
  llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
12055
12734
  }
12056
12735
 
12736
+ llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
12737
+ return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
12738
+ }
12739
+
12740
+ void llama_kv_cache_defrag(struct llama_context * ctx) {
12741
+ llama_kv_cache_defrag(ctx->kv_self);
12742
+ }
12743
+
12744
+ void llama_kv_cache_update(struct llama_context * ctx) {
12745
+ llama_kv_cache_update_internal(*ctx);
12746
+ }
12747
+
12748
+
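llama_kv_cache_seq_shift() is renamed to llama_kv_cache_seq_add(), and the cache gains explicit seq_pos_max/defrag/update entry points. A sketch of the usual context-shift pattern with the renamed API (the single-sequence setup and discard count are illustrative):

#include "llama.h"

// Drop the oldest n_discard positions of a sequence, slide the remainder back,
// then defragment and apply the pending cache operations.
void shift_context(llama_context * ctx, llama_seq_id seq, llama_pos n_discard) {
    llama_kv_cache_seq_rm (ctx, seq, 0, n_discard);
    llama_kv_cache_seq_add(ctx, seq, n_discard, -1, -n_discard);   // was llama_kv_cache_seq_shift()

    llama_kv_cache_defrag(ctx);   // schedules defragmentation
    llama_kv_cache_update(ctx);   // applies pending shift/defrag operations
}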
12057
12749
  // Returns the *maximum* size of the state
12058
12750
  size_t llama_get_state_size(const struct llama_context * ctx) {
12059
12751
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -12064,10 +12756,15 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12064
12756
  // assume worst case for logits although only currently set ones are serialized
12065
12757
  const size_t s_logits = ctx->logits.capacity() * sizeof(float);
12066
12758
  const size_t s_embedding_size = sizeof(size_t);
12067
- const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12068
- const size_t s_kv_size = sizeof(size_t);
12069
- const size_t s_kv_ntok = sizeof(int);
12759
+ const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
12760
+ const size_t s_kv_buf_size = sizeof(size_t);
12761
+ const size_t s_kv_head = sizeof(uint32_t);
12762
+ const size_t s_kv_size = sizeof(uint32_t);
12763
+ const size_t s_kv_used = sizeof(uint32_t);
12070
12764
  const size_t s_kv = ctx->kv_self.total_size();
12765
+ // TODO: assume the max is more than 1 seq_id per KV cell
12766
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
12767
+ const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
12071
12768
 
12072
12769
  const size_t s_total = (
12073
12770
  + s_rng_size
@@ -12076,9 +12773,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12076
12773
  + s_logits
12077
12774
  + s_embedding_size
12078
12775
  + s_embedding
12776
+ + s_kv_buf_size
12777
+ + s_kv_head
12079
12778
  + s_kv_size
12080
- + s_kv_ntok
12779
+ + s_kv_used
12081
12780
  + s_kv
12781
+ + s_kv_cells
12082
12782
  );
12083
12783
 
12084
12784
  return s_total;
@@ -12165,12 +12865,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12165
12865
 
12166
12866
  // copy embeddings
12167
12867
  {
12168
- const size_t embedding_size = ctx->embedding.size();
12868
+ const size_t embeddings_size = ctx->embd.size();
12169
12869
 
12170
- data_ctx->write(&embedding_size, sizeof(embedding_size));
12870
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
12171
12871
 
12172
- if (embedding_size) {
12173
- data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
12872
+ if (embeddings_size) {
12873
+ data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
12174
12874
  }
12175
12875
  }
12176
12876
 
@@ -12178,15 +12878,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12178
12878
  {
12179
12879
  const auto & kv_self = ctx->kv_self;
12180
12880
  const auto & hparams = ctx->model.hparams;
12181
- const auto & cparams = ctx->cparams;
12182
12881
 
12183
- const auto n_layer = hparams.n_layer;
12184
- const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
12185
- const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
12186
- const auto n_ctx = cparams.n_ctx;
12882
+ const uint32_t n_layer = hparams.n_layer;
12883
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12884
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12187
12885
 
12188
12886
  const size_t kv_buf_size = kv_self.total_size();
12189
- const uint32_t kv_head = kv_self.head;
12887
+ const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
12190
12888
  const uint32_t kv_size = kv_self.size;
12191
12889
  const uint32_t kv_used = kv_self.used;
12192
12890
 
@@ -12198,14 +12896,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12198
12896
  if (kv_buf_size) {
12199
12897
  std::vector<uint8_t> tmp_buf;
12200
12898
  for (int il = 0; il < (int) n_layer; ++il) {
12201
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12899
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12900
+
12202
12901
  tmp_buf.resize(k_size);
12203
12902
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
12204
12903
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
12205
12904
 
12206
12905
  // v is not contiguous, copy row by row
12207
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12208
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12906
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12907
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12908
+
12209
12909
  tmp_buf.resize(v_row_size);
12210
12910
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12211
12911
  ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
@@ -12214,7 +12914,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12214
12914
  }
12215
12915
  }
12216
12916
 
12217
- for (uint32_t i = 0; i < kv_size; ++i) {
12917
+ for (uint32_t i = 0; i < kv_head; ++i) {
12218
12918
  const auto & cell = kv_self.cells[i];
12219
12919
 
12220
12920
  const llama_pos pos = cell.pos;
@@ -12238,8 +12938,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
12238
12938
  }
12239
12939
 
12240
12940
  // Sets the state reading from the specified source address
12241
- size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12242
- uint8_t * inp = src;
12941
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12942
+ const uint8_t * inp = src;
12243
12943
 
12244
12944
  // set rng
12245
12945
  {
@@ -12248,7 +12948,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12248
12948
 
12249
12949
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
12250
12950
 
12251
- std::string rng_str((char *)inp, rng_size); inp += rng_size;
12951
+ std::string rng_str((const char *)inp, rng_size); inp += rng_size;
12252
12952
 
12253
12953
  std::istringstream rng_ss(rng_str);
12254
12954
  rng_ss >> ctx->rng;
@@ -12274,15 +12974,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12274
12974
 
12275
12975
  // set embeddings
12276
12976
  {
12277
- size_t embedding_size;
12977
+ size_t embeddings_size;
12978
+
12979
+ memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
12278
12980
 
12279
- memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
12981
+ GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
12280
12982
 
12281
- GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
12983
+ if (embeddings_size) {
12984
+ ctx->embd.resize(embeddings_size);
12282
12985
 
12283
- if (embedding_size) {
12284
- memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12285
- inp += embedding_size * sizeof(float);
12986
+ memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
12987
+ inp += embeddings_size * sizeof(float);
12286
12988
  }
12287
12989
  }
12288
12990
 
@@ -12290,12 +12992,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12290
12992
  {
12291
12993
  const auto & kv_self = ctx->kv_self;
12292
12994
  const auto & hparams = ctx->model.hparams;
12293
- const auto & cparams = ctx->cparams;
12294
12995
 
12295
- const int n_layer = hparams.n_layer;
12296
- const int n_embd_k_gqa = hparams.n_embd_k_gqa();
12297
- const int n_embd_v_gqa = hparams.n_embd_v_gqa();
12298
- const int n_ctx = cparams.n_ctx;
12996
+ const uint32_t n_layer = hparams.n_layer;
12997
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12998
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12299
12999
 
12300
13000
  size_t kv_buf_size;
12301
13001
  uint32_t kv_head;
@@ -12311,13 +13011,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12311
13011
  GGML_ASSERT(kv_self.total_size() == kv_buf_size);
12312
13012
 
12313
13013
  for (int il = 0; il < (int) n_layer; ++il) {
12314
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
13014
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
13015
+
12315
13016
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
12316
13017
  inp += k_size;
12317
13018
 
12318
13019
  // v is not contiguous, copy row by row
12319
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12320
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
13020
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
13021
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
13022
+
12321
13023
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12322
13024
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
12323
13025
  inp += v_row_size;
@@ -12325,13 +13027,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12325
13027
  }
12326
13028
  }
12327
13029
 
13030
+ GGML_ASSERT(kv_self.size == kv_size);
13031
+
12328
13032
  ctx->kv_self.head = kv_head;
12329
13033
  ctx->kv_self.size = kv_size;
12330
13034
  ctx->kv_self.used = kv_used;
12331
13035
 
12332
13036
  ctx->kv_self.cells.resize(kv_size);
12333
13037
 
12334
- for (uint32_t i = 0; i < kv_size; ++i) {
13038
+ for (uint32_t i = 0; i < kv_head; ++i) {
12335
13039
  llama_pos pos;
12336
13040
  size_t seq_id_size;
12337
13041
 
@@ -12347,6 +13051,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12347
13051
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
12348
13052
  }
12349
13053
  }
13054
+
13055
+ for (uint32_t i = kv_head; i < kv_size; ++i) {
13056
+ ctx->kv_self.cells[i].pos = -1;
13057
+ ctx->kv_self.cells[i].seq_id.clear();
13058
+ }
12350
13059
  }
12351
13060
 
12352
13061
  const size_t nread = inp - src;
@@ -12439,43 +13148,16 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
12439
13148
  return true;
12440
13149
  }
12441
13150
 
12442
- int llama_eval(
12443
- struct llama_context * ctx,
12444
- llama_token * tokens,
12445
- int32_t n_tokens,
12446
- int32_t n_past) {
12447
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12448
-
12449
- const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
12450
- if (ret < 0) {
12451
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12452
- }
12453
-
12454
- return ret;
12455
- }
12456
-
12457
- int llama_eval_embd(
12458
- struct llama_context * ctx,
12459
- float * embd,
12460
- int32_t n_tokens,
12461
- int32_t n_past) {
12462
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12463
-
12464
- llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
12465
-
12466
- const int ret = llama_decode_internal(*ctx, batch);
12467
- if (ret < 0) {
12468
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12469
- }
12470
-
12471
- return ret;
12472
- }
12473
-
12474
13151
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
12475
13152
  ctx->cparams.n_threads = n_threads;
12476
13153
  ctx->cparams.n_threads_batch = n_threads_batch;
12477
13154
  }
12478
13155
 
13156
+ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
13157
+ ctx->abort_callback = abort_callback;
13158
+ ctx->abort_callback_data = abort_callback_data;
13159
+ }
13160
+
12479
13161
  struct llama_batch llama_batch_get_one(
12480
13162
  llama_token * tokens,
12481
13163
  int32_t n_tokens,
@@ -12552,11 +13234,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
12552
13234
  }
12553
13235
 
12554
13236
  float * llama_get_embeddings(struct llama_context * ctx) {
12555
- return ctx->embedding.data();
13237
+ return ctx->embd.data();
12556
13238
  }
12557
13239
 
12558
13240
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12559
- return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
13241
+ return ctx->embd.data() + i*ctx->model.hparams.n_embd;
13242
+ }
13243
+
13244
+ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
13245
+ auto it = ctx->embd_seq.find(seq_id);
13246
+ if (it == ctx->embd_seq.end()) {
13247
+ return nullptr;
13248
+ }
13249
+
13250
+ return it->second.data();
12560
13251
  }
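Embedding output moves from the fixed-size embedding vector to the batch-sized embd buffer plus a per-sequence map, and llama_get_embeddings_seq() is the new accessor for pooled per-sequence embeddings. A usage sketch, assuming the context was created with embeddings enabled and a pooling type other than NONE:

#include "llama.h"

#include <vector>

// Copies the pooled embedding of one sequence; returns an empty vector when the
// sequence has no pooled embedding.
std::vector<float> get_seq_embedding(llama_context * ctx, const llama_model * model, llama_seq_id seq) {
    const float * emb = llama_get_embeddings_seq(ctx, seq);
    if (emb == nullptr) {
        return {};
    }
    return std::vector<float>(emb, emb + llama_n_embd(model));
}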
12561
13252
 
12562
13253
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
@@ -12730,7 +13421,7 @@ static int32_t llama_chat_apply_template_internal(
12730
13421
  std::string & dest, bool add_ass) {
12731
13422
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
12732
13423
  std::stringstream ss;
12733
- if (tmpl.find("<|im_start|>") != std::string::npos) {
13424
+ if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
12734
13425
  // chatml template
12735
13426
  for (auto message : chat) {
12736
13427
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -12738,7 +13429,7 @@ static int32_t llama_chat_apply_template_internal(
12738
13429
  if (add_ass) {
12739
13430
  ss << "<|im_start|>assistant\n";
12740
13431
  }
12741
- } else if (tmpl.find("[INST]") != std::string::npos) {
13432
+ } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
12742
13433
  // llama2 template and its variants
12743
13434
  // [variant] support system message
12744
13435
  bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -12773,7 +13464,7 @@ static int32_t llama_chat_apply_template_internal(
12773
13464
  }
12774
13465
  }
12775
13466
  // llama2 templates seem to not care about "add_generation_prompt"
12776
- } else if (tmpl.find("<|user|>") != std::string::npos) {
13467
+ } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
12777
13468
  // zephyr template
12778
13469
  for (auto message : chat) {
12779
13470
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -12781,7 +13472,7 @@ static int32_t llama_chat_apply_template_internal(
12781
13472
  if (add_ass) {
12782
13473
  ss << "<|assistant|>\n";
12783
13474
  }
12784
- } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
13475
+ } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
12785
13476
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
12786
13477
  for (auto message : chat) {
12787
13478
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -12790,7 +13481,7 @@ static int32_t llama_chat_apply_template_internal(
12790
13481
  if (add_ass) {
12791
13482
  ss << "<s>assistant\n";
12792
13483
  }
12793
- } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
13484
+ } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
12794
13485
  // google/gemma-7b-it
12795
13486
  std::string system_prompt = "";
12796
13487
  for (auto message : chat) {
@@ -12837,23 +13528,27 @@ LLAMA_API int32_t llama_chat_apply_template(
12837
13528
  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
12838
13529
  if (res < 0) {
12839
13530
  // worst case: there is no information about template, we will use chatml by default
12840
- curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
13531
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
12841
13532
  } else {
12842
13533
  curr_tmpl = std::string(model_template.data(), model_template.size());
12843
13534
  }
12844
13535
  }
13536
+
12845
13537
  // format the chat to string
12846
13538
  std::vector<const llama_chat_message *> chat_vec;
12847
13539
  chat_vec.resize(n_msg);
12848
13540
  for (size_t i = 0; i < n_msg; i++) {
12849
13541
  chat_vec[i] = &chat[i];
12850
13542
  }
13543
+
12851
13544
  std::string formatted_chat;
12852
13545
  int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
12853
13546
  if (res < 0) {
12854
13547
  return res;
12855
13548
  }
12856
- strncpy(buf, formatted_chat.c_str(), length);
13549
+ if (buf && length > 0) {
13550
+ strncpy(buf, formatted_chat.c_str(), length);
13551
+ }
12857
13552
  return res;
12858
13553
  }
12859
13554
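llama_chat_apply_template() now also accepts short template names ("chatml", "llama2", "zephyr", "monarch", "gemma") rather than only sniffing marker strings, falls back to "chatml" when the model carries no template, and tolerates a null or zero-length output buffer. Together these make the usual two-pass size query safe; a usage sketch (passing a null model because the template name is given explicitly):

#include "llama.h"

#include <string>
#include <vector>

std::string format_chat_chatml(const std::vector<llama_chat_message> & msgs) {
    // First pass: with the new buffer guard, a null buffer just returns the required length.
    int32_t n = llama_chat_apply_template(nullptr, "chatml", msgs.data(), msgs.size(),
                                          /*add_ass=*/true, nullptr, 0);
    if (n < 0) {
        return {};
    }

    std::string out(n, '\0');
    n = llama_chat_apply_template(nullptr, "chatml", msgs.data(), msgs.size(),
                                  /*add_ass=*/true, &out[0], (int32_t) out.size());
    out.resize(n);
    return out;
}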