llama_cpp 0.15.1 → 0.15.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,6 +7,10 @@
7
7
  #include "ggml-alloc.h"
8
8
  #include "ggml-backend.h"
9
9
 
10
+ #ifdef GGML_USE_RPC
11
+ # include "ggml-rpc.h"
12
+ #endif
13
+
10
14
  #ifdef GGML_USE_CUDA
11
15
  # include "ggml-cuda.h"
12
16
  #elif defined(GGML_USE_CLBLAST)
@@ -22,16 +26,9 @@
22
26
  #ifdef GGML_USE_METAL
23
27
  # include "ggml-metal.h"
24
28
  #endif
25
- #ifdef GGML_USE_MPI
26
- # include "ggml-mpi.h"
27
- #endif
28
- #ifndef QK_K
29
- # ifdef GGML_QKK_64
30
- # define QK_K 64
31
- # else
32
- # define QK_K 256
33
- # endif
34
- #endif
29
+
30
+ // TODO: replace with ggml API call
31
+ #define QK_K 256
35
32
 
36
33
  #ifdef __has_include
37
34
  #if __has_include(<unistd.h>)
@@ -106,7 +103,7 @@
106
103
  #endif
107
104
 
108
105
  #define LLAMA_MAX_NODES 8192
109
- #define LLAMA_MAX_EXPERTS 60
106
+ #define LLAMA_MAX_EXPERTS 128
110
107
 
111
108
  //
112
109
  // logging
@@ -201,10 +198,10 @@ enum llm_arch {
201
198
  LLM_ARCH_GPTNEOX,
202
199
  LLM_ARCH_MPT,
203
200
  LLM_ARCH_STARCODER,
204
- LLM_ARCH_PERSIMMON,
205
201
  LLM_ARCH_REFACT,
206
202
  LLM_ARCH_BERT,
207
203
  LLM_ARCH_NOMIC_BERT,
204
+ LLM_ARCH_JINA_BERT_V2,
208
205
  LLM_ARCH_BLOOM,
209
206
  LLM_ARCH_STABLELM,
210
207
  LLM_ARCH_QWEN,
@@ -224,43 +221,45 @@ enum llm_arch {
224
221
  LLM_ARCH_COMMAND_R,
225
222
  LLM_ARCH_DBRX,
226
223
  LLM_ARCH_OLMO,
224
+ LLM_ARCH_ARCTIC,
227
225
  LLM_ARCH_UNKNOWN,
228
226
  };
229
227
 
230
228
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
231
- { LLM_ARCH_LLAMA, "llama" },
232
- { LLM_ARCH_FALCON, "falcon" },
233
- { LLM_ARCH_GROK, "grok" },
234
- { LLM_ARCH_GPT2, "gpt2" },
235
- { LLM_ARCH_GPTJ, "gptj" },
236
- { LLM_ARCH_GPTNEOX, "gptneox" },
237
- { LLM_ARCH_MPT, "mpt" },
238
- { LLM_ARCH_BAICHUAN, "baichuan" },
239
- { LLM_ARCH_STARCODER, "starcoder" },
240
- { LLM_ARCH_PERSIMMON, "persimmon" },
241
- { LLM_ARCH_REFACT, "refact" },
242
- { LLM_ARCH_BERT, "bert" },
243
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
244
- { LLM_ARCH_BLOOM, "bloom" },
245
- { LLM_ARCH_STABLELM, "stablelm" },
246
- { LLM_ARCH_QWEN, "qwen" },
247
- { LLM_ARCH_QWEN2, "qwen2" },
248
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
249
- { LLM_ARCH_PHI2, "phi2" },
250
- { LLM_ARCH_PHI3, "phi3" },
251
- { LLM_ARCH_PLAMO, "plamo" },
252
- { LLM_ARCH_CODESHELL, "codeshell" },
253
- { LLM_ARCH_ORION, "orion" },
254
- { LLM_ARCH_INTERNLM2, "internlm2" },
255
- { LLM_ARCH_MINICPM, "minicpm" },
256
- { LLM_ARCH_GEMMA, "gemma" },
257
- { LLM_ARCH_STARCODER2, "starcoder2" },
258
- { LLM_ARCH_MAMBA, "mamba" },
259
- { LLM_ARCH_XVERSE, "xverse" },
260
- { LLM_ARCH_COMMAND_R, "command-r" },
261
- { LLM_ARCH_DBRX, "dbrx" },
262
- { LLM_ARCH_OLMO, "olmo" },
263
- { LLM_ARCH_UNKNOWN, "(unknown)" },
229
+ { LLM_ARCH_LLAMA, "llama" },
230
+ { LLM_ARCH_FALCON, "falcon" },
231
+ { LLM_ARCH_GROK, "grok" },
232
+ { LLM_ARCH_GPT2, "gpt2" },
233
+ { LLM_ARCH_GPTJ, "gptj" },
234
+ { LLM_ARCH_GPTNEOX, "gptneox" },
235
+ { LLM_ARCH_MPT, "mpt" },
236
+ { LLM_ARCH_BAICHUAN, "baichuan" },
237
+ { LLM_ARCH_STARCODER, "starcoder" },
238
+ { LLM_ARCH_REFACT, "refact" },
239
+ { LLM_ARCH_BERT, "bert" },
240
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
241
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
242
+ { LLM_ARCH_BLOOM, "bloom" },
243
+ { LLM_ARCH_STABLELM, "stablelm" },
244
+ { LLM_ARCH_QWEN, "qwen" },
245
+ { LLM_ARCH_QWEN2, "qwen2" },
246
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
247
+ { LLM_ARCH_PHI2, "phi2" },
248
+ { LLM_ARCH_PHI3, "phi3" },
249
+ { LLM_ARCH_PLAMO, "plamo" },
250
+ { LLM_ARCH_CODESHELL, "codeshell" },
251
+ { LLM_ARCH_ORION, "orion" },
252
+ { LLM_ARCH_INTERNLM2, "internlm2" },
253
+ { LLM_ARCH_MINICPM, "minicpm" },
254
+ { LLM_ARCH_GEMMA, "gemma" },
255
+ { LLM_ARCH_STARCODER2, "starcoder2" },
256
+ { LLM_ARCH_MAMBA, "mamba" },
257
+ { LLM_ARCH_XVERSE, "xverse" },
258
+ { LLM_ARCH_COMMAND_R, "command-r" },
259
+ { LLM_ARCH_DBRX, "dbrx" },
260
+ { LLM_ARCH_OLMO, "olmo" },
261
+ { LLM_ARCH_ARCTIC, "arctic" },
262
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
264
263
  };
265
264
 
266
265
  enum llm_kv {
@@ -303,6 +302,7 @@ enum llm_kv {
303
302
  LLM_KV_ROPE_SCALE_LINEAR,
304
303
  LLM_KV_ROPE_SCALING_TYPE,
305
304
  LLM_KV_ROPE_SCALING_FACTOR,
305
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
306
306
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
307
307
  LLM_KV_ROPE_SCALING_FINETUNED,
308
308
 
@@ -380,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
380
380
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
381
381
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
382
382
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
383
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
383
384
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
384
385
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
385
386
 
@@ -435,6 +436,8 @@ enum llm_tensor {
435
436
  LLM_TENSOR_OUTPUT,
436
437
  LLM_TENSOR_OUTPUT_NORM,
437
438
  LLM_TENSOR_ROPE_FREQS,
439
+ LLM_TENSOR_ROPE_FACTORS_LONG,
440
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
438
441
  LLM_TENSOR_ATTN_Q,
439
442
  LLM_TENSOR_ATTN_K,
440
443
  LLM_TENSOR_ATTN_V,
@@ -454,6 +457,7 @@ enum llm_tensor {
454
457
  LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
455
458
  LLM_TENSOR_FFN_GATE_EXP,
456
459
  LLM_TENSOR_FFN_UP_EXP,
460
+ LLM_TENSOR_FFN_NORM_EXPS,
457
461
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
458
462
  LLM_TENSOR_FFN_GATE_EXPS,
459
463
  LLM_TENSOR_FFN_UP_EXPS,
@@ -592,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
592
596
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
593
597
  },
594
598
  },
595
- {
596
- LLM_ARCH_PERSIMMON,
597
- {
598
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
599
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
600
- { LLM_TENSOR_OUTPUT, "output"},
601
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
602
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
603
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
604
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
605
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
606
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
607
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
608
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
609
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
610
- },
611
- },
612
599
  {
613
600
  LLM_ARCH_MPT,
614
601
  {
@@ -691,6 +678,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
691
678
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
692
679
  },
693
680
  },
681
+ {
682
+ LLM_ARCH_JINA_BERT_V2,
683
+ {
684
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
685
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
686
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
687
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
688
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
689
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
690
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
691
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
692
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
693
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
694
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
695
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
696
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
697
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
698
+ },
699
+ },
694
700
  {
695
701
  LLM_ARCH_BLOOM,
696
702
  {
@@ -800,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
800
806
  {
801
807
  LLM_ARCH_PHI3,
802
808
  {
803
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
804
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
805
- { LLM_TENSOR_OUTPUT, "output" },
806
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
807
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
808
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
809
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
810
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
811
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
812
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
813
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
814
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
809
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
810
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
811
+ { LLM_TENSOR_OUTPUT, "output" },
812
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
813
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
814
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
815
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
816
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
817
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
818
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
819
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
820
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
821
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
822
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
815
823
  },
816
824
  },
817
825
  {
@@ -1027,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1027
1035
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1028
1036
  },
1029
1037
  },
1038
+ {
1039
+ LLM_ARCH_ARCTIC,
1040
+ {
1041
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1042
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1043
+ { LLM_TENSOR_OUTPUT, "output" },
1044
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1045
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1046
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1047
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1048
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1049
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1050
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1051
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1052
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1053
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1054
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
1055
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1056
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1057
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1058
+ },
1059
+ },
1030
1060
  {
1031
1061
  LLM_ARCH_UNKNOWN,
1032
1062
  {
@@ -1664,91 +1694,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
1664
1694
  GGML_UNUSED(host_buffer);
1665
1695
  }
1666
1696
 
1667
- static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1668
- ggml_backend_buffer_type_t buft = nullptr;
1669
-
1670
- #ifdef GGML_USE_METAL
1671
- buft = ggml_backend_metal_buffer_type();
1672
- #elif defined(GGML_USE_CUDA)
1673
- buft = ggml_backend_cuda_buffer_type(gpu);
1674
- #elif defined(GGML_USE_VULKAN)
1675
- buft = ggml_backend_vk_buffer_type(gpu);
1676
- #elif defined(GGML_USE_SYCL)
1677
- buft = ggml_backend_sycl_buffer_type(gpu);
1678
- #elif defined(GGML_USE_CLBLAST)
1679
- buft = ggml_backend_opencl_buffer_type();
1680
- #elif defined(GGML_USE_KOMPUTE)
1681
- buft = ggml_backend_kompute_buffer_type(gpu);
1682
- if (buft == nullptr) {
1683
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
1684
- }
1685
- #endif
1686
-
1687
- if (buft == nullptr) {
1688
- buft = llama_default_buffer_type_cpu(true);
1689
- }
1690
- return buft;
1691
-
1692
- GGML_UNUSED(gpu);
1693
- }
1694
-
1695
- static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
1696
- ggml_backend_buffer_type_t buft = nullptr;
1697
-
1698
- #ifdef GGML_USE_CUDA
1699
- if (ggml_backend_cuda_get_device_count() > 1) {
1700
- buft = ggml_backend_cuda_split_buffer_type(tensor_split);
1701
- }
1702
- #endif
1703
-
1704
- #ifdef GGML_USE_SYCL
1705
- if (ggml_backend_sycl_get_device_count() > 1) {
1706
- buft = ggml_backend_sycl_split_buffer_type(tensor_split);
1707
- }
1708
- #endif
1709
-
1710
- if (buft == nullptr) {
1711
- buft = llama_default_buffer_type_offload(fallback_gpu);
1712
- }
1713
- return buft;
1714
-
1715
- GGML_UNUSED(tensor_split);
1716
- }
1717
-
1718
- static size_t llama_get_device_count() {
1719
- #if defined(GGML_USE_CUDA)
1720
- return ggml_backend_cuda_get_device_count();
1721
- #elif defined(GGML_USE_SYCL)
1722
- return ggml_backend_sycl_get_device_count();
1723
- #elif defined(GGML_USE_VULKAN)
1724
- return ggml_backend_vk_get_device_count();
1725
- #else
1726
- return 1;
1727
- #endif
1728
- }
1729
-
1730
- static size_t llama_get_device_memory(int device) {
1731
- #if defined(GGML_USE_CUDA)
1732
- size_t total;
1733
- size_t free;
1734
- ggml_backend_cuda_get_device_memory(device, &free, &total);
1735
- return free;
1736
- #elif defined(GGML_USE_SYCL)
1737
- size_t total;
1738
- size_t free;
1739
- ggml_backend_sycl_get_device_memory(device, &free, &total);
1740
- return free;
1741
- #elif defined(GGML_USE_VULKAN)
1742
- size_t total;
1743
- size_t free;
1744
- ggml_backend_vk_get_device_memory(device, &free, &total);
1745
- return free;
1746
- #else
1747
- return 1;
1748
- GGML_UNUSED(device);
1749
- #endif
1750
- }
1751
-
1752
1697
  //
1753
1698
  // globals
1754
1699
  //
@@ -1757,6 +1702,8 @@ struct llama_state {
1757
1702
  llama_state() {
1758
1703
  #ifdef GGML_USE_METAL
1759
1704
  ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
1705
+ #elif defined(GGML_USE_CUDA)
1706
+ ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
1760
1707
  #endif
1761
1708
  }
1762
1709
 
@@ -1770,17 +1717,24 @@ static llama_state g_state;
1770
1717
  // available llama models
1771
1718
  enum e_model {
1772
1719
  MODEL_UNKNOWN,
1720
+ MODEL_14M,
1773
1721
  MODEL_17M,
1774
1722
  MODEL_22M,
1775
1723
  MODEL_33M,
1724
+ MODEL_70M,
1776
1725
  MODEL_109M,
1777
1726
  MODEL_137M,
1727
+ MODEL_160M,
1778
1728
  MODEL_335M,
1729
+ MODEL_410M,
1779
1730
  MODEL_0_5B,
1780
1731
  MODEL_1B,
1732
+ MODEL_1_4B,
1781
1733
  MODEL_2B,
1734
+ MODEL_2_8B,
1782
1735
  MODEL_3B,
1783
1736
  MODEL_4B,
1737
+ MODEL_6_9B,
1784
1738
  MODEL_7B,
1785
1739
  MODEL_8B,
1786
1740
  MODEL_12B,
@@ -1803,6 +1757,7 @@ enum e_model {
1803
1757
  MODEL_8x7B,
1804
1758
  MODEL_8x22B,
1805
1759
  MODEL_16x12B,
1760
+ MODEL_10B_128x3_66B,
1806
1761
  };
1807
1762
 
1808
1763
  static const size_t kiB = 1024;
@@ -1812,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
1812
1767
  struct llama_hparams {
1813
1768
  bool vocab_only;
1814
1769
  bool rope_finetuned;
1770
+ bool use_par_res;
1815
1771
 
1816
1772
  uint32_t n_vocab;
1817
1773
  uint32_t n_ctx_train; // context size the model was trained on
@@ -1830,6 +1786,7 @@ struct llama_hparams {
1830
1786
  float f_norm_eps;
1831
1787
  float f_norm_rms_eps;
1832
1788
 
1789
+ float rope_attn_factor = 1.0f;
1833
1790
  float rope_freq_base_train;
1834
1791
  float rope_freq_scale_train;
1835
1792
  uint32_t n_yarn_orig_ctx;
@@ -1845,7 +1802,7 @@ struct llama_hparams {
1845
1802
  float f_logit_scale = 0.0f;
1846
1803
 
1847
1804
  bool causal_attn = true;
1848
- bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
1805
+ bool use_alibi = false;
1849
1806
 
1850
1807
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1851
1808
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1878,6 +1835,7 @@ struct llama_hparams {
1878
1835
 
1879
1836
  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
1880
1837
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
1838
+ if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
1881
1839
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
1882
1840
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
1883
1841
 
@@ -1975,6 +1933,7 @@ struct llama_layer {
1975
1933
  struct ggml_tensor * ffn_norm_b;
1976
1934
  struct ggml_tensor * layer_out_norm;
1977
1935
  struct ggml_tensor * layer_out_norm_b;
1936
+ struct ggml_tensor * ffn_norm_exps;
1978
1937
 
1979
1938
  // ff
1980
1939
  struct ggml_tensor * ffn_gate; // w1
@@ -2012,6 +1971,10 @@ struct llama_layer {
2012
1971
  // mamba bias
2013
1972
  struct ggml_tensor * ssm_conv1d_b;
2014
1973
  struct ggml_tensor * ssm_dt_b;
1974
+
1975
+ // long rope factors
1976
+ struct ggml_tensor * rope_long = nullptr;
1977
+ struct ggml_tensor * rope_short = nullptr;
2015
1978
  };
2016
1979
 
2017
1980
  struct llama_kv_cell {
@@ -2189,6 +2152,8 @@ struct llama_model {
2189
2152
  int main_gpu;
2190
2153
  int n_gpu_layers;
2191
2154
 
2155
+ std::vector<std::string> rpc_servers;
2156
+
2192
2157
  // gguf metadata
2193
2158
  std::unordered_map<std::string, std::string> gguf_kv;
2194
2159
 
@@ -2317,7 +2282,6 @@ struct llama_context {
2317
2282
  struct ggml_tensor * inp_pos; // I32 [n_batch]
2318
2283
  struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
2319
2284
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
2320
- struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
2321
2285
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
2322
2286
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
2323
2287
  struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2327,11 +2291,105 @@ struct llama_context {
2327
2291
 
2328
2292
  // control vectors
2329
2293
  struct llama_control_vector cvec;
2294
+ };
2295
+
2296
+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
2297
+ ggml_backend_buffer_type_t buft = nullptr;
2298
+
2299
+ #ifdef GGML_USE_RPC
2300
+ std::string endpoint = model.rpc_servers[gpu];
2301
+ buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
2302
+ #elif defined(GGML_USE_METAL)
2303
+ buft = ggml_backend_metal_buffer_type();
2304
+ #elif defined(GGML_USE_CUDA)
2305
+ buft = ggml_backend_cuda_buffer_type(gpu);
2306
+ #elif defined(GGML_USE_VULKAN)
2307
+ buft = ggml_backend_vk_buffer_type(gpu);
2308
+ #elif defined(GGML_USE_SYCL)
2309
+ buft = ggml_backend_sycl_buffer_type(gpu);
2310
+ #elif defined(GGML_USE_CLBLAST)
2311
+ buft = ggml_backend_opencl_buffer_type();
2312
+ #elif defined(GGML_USE_KOMPUTE)
2313
+ buft = ggml_backend_kompute_buffer_type(gpu);
2314
+ if (buft == nullptr) {
2315
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
2316
+ }
2317
+ #endif
2318
+
2319
+ if (buft == nullptr) {
2320
+ buft = llama_default_buffer_type_cpu(true);
2321
+ }
2322
+ return buft;
2323
+ GGML_UNUSED(model);
2324
+ GGML_UNUSED(gpu);
2325
+ }
2330
2326
 
2331
- #ifdef GGML_USE_MPI
2332
- ggml_mpi_context * ctx_mpi = NULL;
2327
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
2328
+ ggml_backend_buffer_type_t buft = nullptr;
2329
+
2330
+ #ifdef GGML_USE_CUDA
2331
+ if (ggml_backend_cuda_get_device_count() > 1) {
2332
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
2333
+ }
2333
2334
  #endif
2334
- };
2335
+
2336
+ #ifdef GGML_USE_SYCL
2337
+ if (ggml_backend_sycl_get_device_count() > 1) {
2338
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
2339
+ }
2340
+ #endif
2341
+
2342
+ if (buft == nullptr) {
2343
+ buft = llama_default_buffer_type_offload(model, fallback_gpu);
2344
+ }
2345
+ return buft;
2346
+
2347
+ GGML_UNUSED(tensor_split);
2348
+ }
2349
+
2350
+ static size_t llama_get_device_count(const llama_model & model) {
2351
+ #if defined(GGML_USE_RPC)
2352
+ return model.rpc_servers.size();
2353
+ #elif defined(GGML_USE_CUDA)
2354
+ return ggml_backend_cuda_get_device_count();
2355
+ #elif defined(GGML_USE_SYCL)
2356
+ return ggml_backend_sycl_get_device_count();
2357
+ #elif defined(GGML_USE_VULKAN)
2358
+ return ggml_backend_vk_get_device_count();
2359
+ #else
2360
+ return 1;
2361
+ #endif
2362
+ GGML_UNUSED(model);
2363
+ }
2364
+
2365
+ static size_t llama_get_device_memory(const llama_model & model, int device) {
2366
+ #if defined(GGML_USE_RPC)
2367
+ size_t total;
2368
+ size_t free;
2369
+ std::string endpoint = model.rpc_servers[device];
2370
+ ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
2371
+ return free;
2372
+ #elif defined(GGML_USE_CUDA)
2373
+ size_t total;
2374
+ size_t free;
2375
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
2376
+ return free;
2377
+ #elif defined(GGML_USE_SYCL)
2378
+ size_t total;
2379
+ size_t free;
2380
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
2381
+ return free;
2382
+ #elif defined(GGML_USE_VULKAN)
2383
+ size_t total;
2384
+ size_t free;
2385
+ ggml_backend_vk_get_device_memory(device, &free, &total);
2386
+ return free;
2387
+ #else
2388
+ return 1;
2389
+ #endif
2390
+ GGML_UNUSED(model);
2391
+ GGML_UNUSED(device);
2392
+ }
2335
2393
 
2336
2394
  //
2337
2395
  // kv cache helpers
@@ -2452,7 +2510,6 @@ static bool llama_kv_cache_init(
2452
2510
  static bool llama_kv_cache_find_slot(
2453
2511
  struct llama_kv_cache & cache,
2454
2512
  const struct llama_batch & batch) {
2455
- const uint32_t n_ctx = cache.size;
2456
2513
  const uint32_t n_tokens = batch.n_tokens;
2457
2514
 
2458
2515
  if (cache.recurrent) {
@@ -2503,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
2503
2560
  }
2504
2561
  // otherwise, one cell per token.
2505
2562
 
2506
- if (n_tokens > n_ctx) {
2507
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
2563
+ if (n_tokens > cache.size) {
2564
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
2508
2565
  return false;
2509
2566
  }
2510
2567
 
2511
2568
  uint32_t n_tested = 0;
2512
2569
 
2513
2570
  while (true) {
2514
- if (cache.head + n_tokens > n_ctx) {
2515
- n_tested += n_ctx - cache.head;
2571
+ if (cache.head + n_tokens > cache.size) {
2572
+ n_tested += cache.size - cache.head;
2516
2573
  cache.head = 0;
2517
2574
  continue;
2518
2575
  }
@@ -2531,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
2531
2588
  break;
2532
2589
  }
2533
2590
 
2534
- if (n_tested >= n_ctx) {
2591
+ if (n_tested >= cache.size) {
2535
2592
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
2536
2593
  return false;
2537
2594
  }
@@ -2785,6 +2842,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
2785
2842
  cache.do_defrag = true;
2786
2843
  }
2787
2844
 
2845
+ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
2846
+ // the FA kernels require padding to avoid extra runtime boundary checks
2847
+ return cparams.flash_attn ? 256u : 32u;
2848
+ }
2849
+
2788
2850
  //
2789
2851
  // model loading and saving
2790
2852
  //
@@ -3287,22 +3349,55 @@ struct llama_model_loader {
3287
3349
  }
3288
3350
 
3289
3351
  template<typename T>
3290
- bool get_key(const std::string & key, T & result, const bool required = true) {
3291
- auto it = kv_overrides.find(key);
3352
+ bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
3353
+ const int kid = gguf_find_key(meta, key.c_str());
3292
3354
 
3293
- const struct llama_model_kv_override * override =
3294
- it != kv_overrides.end() ? &it->second : nullptr;
3355
+ if (kid < 0) {
3356
+ if (required) {
3357
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
3358
+ }
3359
+ return false;
3360
+ }
3295
3361
 
3296
- const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
3362
+ struct GGUFMeta::ArrayInfo arr_info =
3363
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
3297
3364
 
3298
- if (required && !found) {
3299
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
3365
+ if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
3366
+ throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
3300
3367
  }
3301
3368
 
3302
- return found;
3303
- }
3369
+ // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
3370
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
3371
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
3304
3372
 
3305
- template<typename T>
3373
+ result.resize(arr_info.length);
3374
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
3375
+
3376
+ return true;
3377
+ }
3378
+
3379
+ template<typename T>
3380
+ bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
3381
+ return get_arr(llm_kv(kid), result, required);
3382
+ }
3383
+
3384
+ template<typename T>
3385
+ bool get_key(const std::string & key, T & result, const bool required = true) {
3386
+ auto it = kv_overrides.find(key);
3387
+
3388
+ const struct llama_model_kv_override * override =
3389
+ it != kv_overrides.end() ? &it->second : nullptr;
3390
+
3391
+ const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
3392
+
3393
+ if (required && !found) {
3394
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
3395
+ }
3396
+
3397
+ return found;
3398
+ }
3399
+
3400
+ template<typename T>
3306
3401
  bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
3307
3402
  return get_key(llm_kv(kid), result, required);
3308
3403
  }
@@ -3360,11 +3455,15 @@ struct llama_model_loader {
3360
3455
  return get_tensor_meta(get_tensor_name(i));
3361
3456
  }
3362
3457
 
3363
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
3458
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
3364
3459
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
3365
3460
  ggml_set_name(tensor, ggml_get_name(cur));
3366
3461
 
3367
- n_created++;
3462
+ if (duplicated) {
3463
+ size_data += ggml_nbytes(cur);
3464
+ } else {
3465
+ n_created++;
3466
+ }
3368
3467
 
3369
3468
  return tensor;
3370
3469
  }
@@ -3399,14 +3498,17 @@ struct llama_model_loader {
3399
3498
  return cur;
3400
3499
  }
3401
3500
 
3402
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
3403
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
3501
+ static const int TENSOR_NOT_REQUIRED = 1;
3502
+ static const int TENSOR_DUPLICATED = 2;
3503
+
3504
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
3505
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
3404
3506
 
3405
3507
  if (cur == NULL) {
3406
3508
  return NULL;
3407
3509
  }
3408
3510
 
3409
- return create_tensor_for(ctx, cur);
3511
+ return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
3410
3512
  }
3411
3513
 
3412
3514
  struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3706,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
3706
3808
 
3707
3809
  static const char * llama_model_type_name(e_model type) {
3708
3810
  switch (type) {
3709
- case MODEL_22M: return "22M";
3710
- case MODEL_33M: return "33M";
3711
- case MODEL_109M: return "109M";
3712
- case MODEL_137M: return "137M";
3713
- case MODEL_0_5B: return "0.5B";
3714
- case MODEL_1B: return "1B";
3715
- case MODEL_2B: return "2B";
3716
- case MODEL_3B: return "3B";
3717
- case MODEL_7B: return "7B";
3718
- case MODEL_8B: return "8B";
3719
- case MODEL_12B: return "12B";
3720
- case MODEL_13B: return "13B";
3721
- case MODEL_14B: return "14B";
3722
- case MODEL_15B: return "15B";
3723
- case MODEL_20B: return "20B";
3724
- case MODEL_30B: return "30B";
3725
- case MODEL_34B: return "34B";
3726
- case MODEL_35B: return "35B";
3727
- case MODEL_40B: return "40B";
3728
- case MODEL_65B: return "65B";
3729
- case MODEL_70B: return "70B";
3730
- case MODEL_314B: return "314B";
3731
- case MODEL_SMALL: return "0.1B";
3732
- case MODEL_MEDIUM: return "0.4B";
3733
- case MODEL_LARGE: return "0.8B";
3734
- case MODEL_XL: return "1.5B";
3735
- case MODEL_A2_7B: return "A2.7B";
3736
- case MODEL_8x7B: return "8x7B";
3737
- case MODEL_8x22B: return "8x22B";
3738
- case MODEL_16x12B: return "16x12B";
3739
- default: return "?B";
3811
+ case MODEL_14M: return "14M";
3812
+ case MODEL_17M: return "17M";
3813
+ case MODEL_22M: return "22M";
3814
+ case MODEL_33M: return "33M";
3815
+ case MODEL_70M: return "70M";
3816
+ case MODEL_109M: return "109M";
3817
+ case MODEL_137M: return "137M";
3818
+ case MODEL_160M: return "160M";
3819
+ case MODEL_335M: return "335M";
3820
+ case MODEL_410M: return "410M";
3821
+ case MODEL_0_5B: return "0.5B";
3822
+ case MODEL_1B: return "1B";
3823
+ case MODEL_1_4B: return "1.4B";
3824
+ case MODEL_2B: return "2B";
3825
+ case MODEL_2_8B: return "2.8B";
3826
+ case MODEL_3B: return "3B";
3827
+ case MODEL_4B: return "4B";
3828
+ case MODEL_6_9B: return "6.9B";
3829
+ case MODEL_7B: return "7B";
3830
+ case MODEL_8B: return "8B";
3831
+ case MODEL_12B: return "12B";
3832
+ case MODEL_13B: return "13B";
3833
+ case MODEL_14B: return "14B";
3834
+ case MODEL_15B: return "15B";
3835
+ case MODEL_20B: return "20B";
3836
+ case MODEL_30B: return "30B";
3837
+ case MODEL_34B: return "34B";
3838
+ case MODEL_35B: return "35B";
3839
+ case MODEL_40B: return "40B";
3840
+ case MODEL_65B: return "65B";
3841
+ case MODEL_70B: return "70B";
3842
+ case MODEL_314B: return "314B";
3843
+ case MODEL_SMALL: return "0.1B";
3844
+ case MODEL_MEDIUM: return "0.4B";
3845
+ case MODEL_LARGE: return "0.8B";
3846
+ case MODEL_XL: return "1.5B";
3847
+ case MODEL_A2_7B: return "A2.7B";
3848
+ case MODEL_8x7B: return "8x7B";
3849
+ case MODEL_8x22B: return "8x22B";
3850
+ case MODEL_16x12B: return "16x12B";
3851
+ case MODEL_10B_128x3_66B: return "10B+128x3.66B";
3852
+ default: return "?B";
3740
3853
  }
3741
3854
  }
3742
3855
 
@@ -3779,6 +3892,12 @@ static void llm_load_hparams(
3779
3892
 
3780
3893
  // get hparams kv
3781
3894
  ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
3895
+
3896
+ // everything past this point is not vocab-related
3897
+ if (hparams.vocab_only) {
3898
+ return;
3899
+ }
3900
+
3782
3901
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
3783
3902
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
3784
3903
  ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3823,6 +3942,8 @@ static void llm_load_hparams(
3823
3942
  }
3824
3943
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
3825
3944
 
3945
+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
3946
+
3826
3947
  // sanity check for n_rot (optional)
3827
3948
  {
3828
3949
  hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3860,7 +3981,7 @@ static void llm_load_hparams(
3860
3981
  switch (hparams.n_layer) {
3861
3982
  case 22: model.type = e_model::MODEL_1B; break;
3862
3983
  case 26: model.type = e_model::MODEL_3B; break;
3863
- case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
3984
+ case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
3864
3985
  case 40: model.type = e_model::MODEL_13B; break;
3865
3986
  case 48: model.type = e_model::MODEL_34B; break;
3866
3987
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3922,14 +4043,6 @@ static void llm_load_hparams(
3922
4043
  default: model.type = e_model::MODEL_UNKNOWN;
3923
4044
  }
3924
4045
  } break;
3925
- case LLM_ARCH_PERSIMMON:
3926
- {
3927
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3928
- switch (hparams.n_layer) {
3929
- case 36: model.type = e_model::MODEL_8B; break;
3930
- default: model.type = e_model::MODEL_UNKNOWN;
3931
- }
3932
- } break;
3933
4046
  case LLM_ARCH_REFACT:
3934
4047
  {
3935
4048
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3962,6 +4075,19 @@ static void llm_load_hparams(
3962
4075
  model.type = e_model::MODEL_335M; break; // bge-large
3963
4076
  }
3964
4077
  } break;
4078
+ case LLM_ARCH_JINA_BERT_V2:
4079
+ {
4080
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4081
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
4082
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
4083
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
4084
+ hparams.f_max_alibi_bias = 8.0f;
4085
+
4086
+ switch (hparams.n_layer) {
4087
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
4088
+ case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
4089
+ }
4090
+ } break;
3965
4091
  case LLM_ARCH_NOMIC_BERT:
3966
4092
  {
3967
4093
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4058,6 +4184,7 @@ static void llm_load_hparams(
4058
4184
  switch (hparams.n_layer) {
4059
4185
  case 24: model.type = e_model::MODEL_1B; break;
4060
4186
  case 32: model.type = e_model::MODEL_3B; break;
4187
+ case 40: model.type = e_model::MODEL_14B; break;
4061
4188
  default: model.type = e_model::MODEL_UNKNOWN;
4062
4189
  }
4063
4190
  } break;
@@ -4198,6 +4325,65 @@ static void llm_load_hparams(
4198
4325
  default: model.type = e_model::MODEL_UNKNOWN;
4199
4326
  }
4200
4327
  } break;
4328
+ case LLM_ARCH_GPTNEOX:
4329
+ {
4330
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4331
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
4332
+ switch (hparams.n_layer) {
4333
+ case 6:
4334
+ switch (hparams.n_ff) {
4335
+ case 512: model.type = e_model::MODEL_14M; break;
4336
+ case 2048: model.type = e_model::MODEL_70M; break;
4337
+ default: model.type = e_model::MODEL_UNKNOWN;
4338
+ } break;
4339
+ case 12:
4340
+ switch (hparams.n_ff) {
4341
+ case 3072: model.type = e_model::MODEL_160M; break;
4342
+ default: model.type = e_model::MODEL_UNKNOWN;
4343
+ } break;
4344
+ case 16:
4345
+ switch (hparams.n_ff) {
4346
+ case 8192: model.type = e_model::MODEL_1B; break;
4347
+ default: model.type = e_model::MODEL_UNKNOWN;
4348
+ } break;
4349
+ case 24:
4350
+ switch (hparams.n_ff) {
4351
+ case 4096: model.type = e_model::MODEL_410M; break;
4352
+ case 8192: model.type = e_model::MODEL_1_4B; break;
4353
+ default: model.type = e_model::MODEL_UNKNOWN;
4354
+ } break;
4355
+ case 32:
4356
+ switch (hparams.n_ff) {
4357
+ case 10240: model.type = e_model::MODEL_2_8B; break;
4358
+ case 16384: model.type = e_model::MODEL_6_9B; break;
4359
+ default: model.type = e_model::MODEL_UNKNOWN;
4360
+ } break;
4361
+ case 36:
4362
+ switch (hparams.n_ff) {
4363
+ case 20480: model.type = e_model::MODEL_12B; break;
4364
+ default: model.type = e_model::MODEL_UNKNOWN;
4365
+ } break;
4366
+ case 44:
4367
+ switch (hparams.n_ff) {
4368
+ case 24576: model.type = e_model::MODEL_20B; break;
4369
+ default: model.type = e_model::MODEL_UNKNOWN;
4370
+ } break;
4371
+ default: model.type = e_model::MODEL_UNKNOWN;
4372
+ }
4373
+ } break;
4374
+ case LLM_ARCH_ARCTIC:
4375
+ {
4376
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4377
+
4378
+ if (hparams.n_expert == 128) {
4379
+ switch (hparams.n_layer) {
4380
+ case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
4381
+ default: model.type = e_model::MODEL_UNKNOWN;
4382
+ }
4383
+ } else {
4384
+ model.type = e_model::MODEL_UNKNOWN;
4385
+ }
4386
+ } break;
4201
4387
  default: (void)0;
4202
4388
  }
4203
4389
 
@@ -4383,7 +4569,11 @@ static void llm_load_vocab(
4383
4569
  tokenizer_pre == "starcoder") {
4384
4570
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4385
4571
  } else if (
4386
- tokenizer_pre == "gpt-2") {
4572
+ tokenizer_pre == "gpt-2" ||
4573
+ tokenizer_pre == "jina-es" ||
4574
+ tokenizer_pre == "jina-de" ||
4575
+ tokenizer_pre == "jina-v2-es" ||
4576
+ tokenizer_pre == "jina-v2-de") {
4387
4577
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4388
4578
  } else if (
4389
4579
  tokenizer_pre == "refact") {
@@ -4394,6 +4584,9 @@ static void llm_load_vocab(
4394
4584
  } else if (
4395
4585
  tokenizer_pre == "qwen2") {
4396
4586
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
4587
+ } else if (
4588
+ tokenizer_pre == "stablelm2") {
4589
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
4397
4590
  } else if (
4398
4591
  tokenizer_pre == "olmo") {
4399
4592
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -4515,7 +4708,8 @@ static void llm_load_vocab(
4515
4708
  (t.first == "<|eot_id|>" ||
4516
4709
  t.first == "<|im_end|>" ||
4517
4710
  t.first == "<|end|>" ||
4518
- t.first == "<end_of_turn>"
4711
+ t.first == "<end_of_turn>" ||
4712
+ t.first == "<|endoftext|>"
4519
4713
  )
4520
4714
  ) {
4521
4715
  vocab.special_eot_id = t.second;
@@ -4743,13 +4937,13 @@ static bool llm_load_tensors(
4743
4937
 
4744
4938
  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
4745
4939
  // calculate the split points
4746
- int device_count = llama_get_device_count();
4940
+ int device_count = llama_get_device_count(model);
4747
4941
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
4748
4942
  std::vector<float> splits(device_count);
4749
4943
  if (all_zero) {
4750
4944
  // default split, by free memory
4751
4945
  for (int i = 0; i < device_count; ++i) {
4752
- splits[i] = llama_get_device_memory(i);
4946
+ splits[i] = llama_get_device_memory(model, i);
4753
4947
  }
4754
4948
  } else {
4755
4949
  std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4963,35 @@ static bool llm_load_tensors(
4769
4963
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
4770
4964
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4771
4965
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
4772
- model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
4966
+ model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
4773
4967
  }
4774
4968
  // assign the output layer
4775
4969
  if (n_gpu_layers > n_layer) {
4776
4970
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
4777
- model.buft_output = llama_default_buffer_type_offload(layer_gpu);
4971
+ model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
4778
4972
  } else {
4779
4973
  model.buft_output = llama_default_buffer_type_cpu(true);
4780
4974
  }
4781
4975
  } else {
4782
4976
  ggml_backend_buffer_type_t split_buft;
4783
4977
  if (split_mode == LLAMA_SPLIT_MODE_ROW) {
4784
- split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
4978
+ split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
4785
4979
  } else {
4786
4980
  // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
4787
- split_buft = llama_default_buffer_type_offload(main_gpu);
4981
+ split_buft = llama_default_buffer_type_offload(model, main_gpu);
4788
4982
  }
4789
4983
  // assign the repeating layers
4790
4984
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4791
4985
  model.buft_layer[i] = {
4792
4986
  split_buft,
4793
- llama_default_buffer_type_offload(main_gpu)
4987
+ llama_default_buffer_type_offload(model, main_gpu)
4794
4988
  };
4795
4989
  }
4796
4990
  // assign the output layer
4797
4991
  if (n_gpu_layers > n_layer) {
4798
4992
  model.buft_output = {
4799
4993
  split_buft,
4800
- llama_default_buffer_type_offload(main_gpu)
4994
+ llama_default_buffer_type_offload(model, main_gpu)
4801
4995
  };
4802
4996
  } else {
4803
4997
  model.buft_output = llama_default_buffer_type_cpu(true);
@@ -4841,6 +5035,7 @@ static bool llm_load_tensors(
4841
5035
  // create tensors for the weights
4842
5036
  {
4843
5037
  const int64_t n_embd = hparams.n_embd;
5038
+ const int64_t n_embd_head = n_embd / hparams.n_head;
4844
5039
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4845
5040
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4846
5041
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4875,12 +5070,10 @@ static bool llm_load_tensors(
4875
5070
  {
4876
5071
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4877
5072
  if (model.arch != LLM_ARCH_MINICPM){
4878
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5073
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
4879
5074
  // if output is NULL, init from the input tok embed
4880
5075
  if (model.output == NULL) {
4881
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4882
- ml.n_created--; // artificial tensor
4883
- ml.size_data += ggml_nbytes(model.output);
5076
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
4884
5077
  }
4885
5078
  }
4886
5079
  }
@@ -4899,10 +5092,10 @@ static bool llm_load_tensors(
4899
5092
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4900
5093
 
4901
5094
  // optional bias tensors
4902
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
4903
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
4904
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
4905
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
5095
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5096
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5097
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5098
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
4906
5099
 
4907
5100
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4908
5101
 
@@ -4913,7 +5106,7 @@ static bool llm_load_tensors(
4913
5106
  } else {
4914
5107
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4915
5108
 
4916
- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
5109
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
4917
5110
  if (layer.ffn_gate_exps) {
4918
5111
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4919
5112
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -4955,12 +5148,10 @@ static bool llm_load_tensors(
4955
5148
  // output
4956
5149
  {
4957
5150
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4958
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5151
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
4959
5152
  // if output is NULL, init from the input tok embed
4960
5153
  if (model.output == NULL) {
4961
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4962
- ml.n_created--; // artificial tensor
4963
- ml.size_data += ggml_nbytes(model.output);
5154
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
4964
5155
  }
4965
5156
  }
4966
5157
 
@@ -4983,7 +5174,7 @@ static bool llm_load_tensors(
4983
5174
 
4984
5175
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4985
5176
 
4986
- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
5177
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
4987
5178
  if (layer.ffn_gate_exps) {
4988
5179
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4989
5180
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5085,11 +5276,9 @@ static bool llm_load_tensors(
5085
5276
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5086
5277
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5087
5278
 
5088
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5279
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5089
5280
  if (!model.output) {
5090
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
5091
- ml.n_created--; // artificial tensor
5092
- ml.size_data += ggml_nbytes(model.output);
5281
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
5093
5282
  }
5094
5283
  }
5095
5284
 
@@ -5102,8 +5291,8 @@ static bool llm_load_tensors(
5102
5291
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5103
5292
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5104
5293
 
5105
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
5106
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
5294
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5295
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5107
5296
 
5108
5297
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5109
5298
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5121,7 +5310,12 @@ static bool llm_load_tensors(
5121
5310
  {
5122
5311
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5123
5312
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5124
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5313
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5314
+ if (!model.output) {
5315
+ // needs to be on GPU
5316
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5317
+ }
5318
+
5125
5319
  }
5126
5320
 
5127
5321
  for (int i = 0; i < n_layer; ++i) {
@@ -5149,47 +5343,6 @@ static bool llm_load_tensors(
5149
5343
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5150
5344
  }
5151
5345
  } break;
5152
- case LLM_ARCH_PERSIMMON:
5153
- {
5154
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5155
-
5156
- {
5157
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5158
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5159
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5160
- }
5161
-
5162
- for (int i = 0; i < n_layer; ++i) {
5163
- ggml_context * ctx_layer = ctx_for_layer(i);
5164
- ggml_context * ctx_split = ctx_for_layer_split(i);
5165
-
5166
- auto & layer = model.layers[i];
5167
-
5168
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5169
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5170
-
5171
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5172
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
5173
-
5174
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5175
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
5176
-
5177
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5178
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5179
-
5180
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5181
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5182
-
5183
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5184
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
5185
-
5186
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
5187
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
5188
-
5189
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
5190
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
5191
- }
5192
- } break;
5193
5346
  case LLM_ARCH_BERT:
5194
5347
  case LLM_ARCH_NOMIC_BERT:
5195
5348
  {
@@ -5242,6 +5395,50 @@ static bool llm_load_tensors(
5242
5395
  layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5243
5396
  }
5244
5397
  } break;
5398
+ case LLM_ARCH_JINA_BERT_V2:
5399
+ {
5400
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
5401
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
5402
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
5403
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
5404
+
5405
+ for (int i = 0; i < n_layer; ++i) {
5406
+ ggml_context * ctx_layer = ctx_for_layer(i);
5407
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5408
+
5409
+ auto & layer = model.layers[i]; // JinaBertLayer
5410
+
5411
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5412
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5413
+
5414
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5415
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5416
+
5417
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5418
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5419
+
5420
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5421
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5422
+
5423
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5424
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5425
+
5426
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
5427
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
5428
+
5429
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
5430
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
5431
+
5432
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5433
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5434
+
5435
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5436
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5437
+
5438
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
5439
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5440
+ }
5441
+ } break;
5245
5442
  case LLM_ARCH_BLOOM:
5246
5443
  {
5247
5444
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5283,18 +5480,16 @@ static bool llm_load_tensors(
5283
5480
  case LLM_ARCH_MPT:
5284
5481
  {
5285
5482
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5286
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
5483
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
5287
5484
 
5288
5485
  // output
5289
5486
  {
5290
5487
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5291
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
5488
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5292
5489
 
5293
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5490
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5294
5491
  if (!model.output) {
5295
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
5296
- ml.n_created--; // artificial tensor
5297
- ml.size_data += ggml_nbytes(model.output);
5492
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
5298
5493
  }
5299
5494
  }
5300
5495
 
@@ -5305,31 +5500,31 @@ static bool llm_load_tensors(
5305
5500
  auto & layer = model.layers[i];
5306
5501
 
5307
5502
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5308
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
5503
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5309
5504
 
5310
5505
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5311
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5506
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5312
5507
 
5313
5508
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5314
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
5509
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5315
5510
 
5316
5511
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5317
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5512
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5318
5513
 
5319
5514
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5320
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
5515
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5321
5516
 
5322
5517
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5323
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
5518
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5324
5519
 
5325
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5326
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5520
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5521
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5327
5522
 
5328
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5329
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5523
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5524
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5330
5525
 
5331
5526
  // AWQ ScaleActivation layer
5332
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
5527
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5333
5528
  }
5334
5529
  } break;
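
The hunks above replace the old trailing boolean of ml.create_tensor (false marked a tensor as optional) and the manual ml.n_created-- / ml.size_data fix-up for aliased outputs with the llama_model_loader::TENSOR_NOT_REQUIRED and TENSOR_DUPLICATED bit flags. Below is a minimal standalone sketch of that convention, not the loader's actual implementation; the fake_* names are invented for illustration.

    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>
    #include <string>

    enum tensor_flags : uint32_t {            // hypothetical mirror of the loader flags
        TENSOR_REQUIRED     = 0,
        TENSOR_NOT_REQUIRED = 1 << 0,         // a missing tensor yields nullptr instead of an error
        TENSOR_DUPLICATED   = 1 << 1,         // tensor aliases data that is already counted
    };

    struct fake_tensor { std::string name; size_t nbytes; };

    struct fake_loader {
        int    n_created = 0;
        size_t size_data = 0;

        // ownership is elided in this sketch
        fake_tensor * create_tensor(const std::string & name, size_t nbytes, uint32_t flags = 0) {
            const bool found = lookup(name);                 // pretend metadata lookup
            if (!found) {
                if (flags & TENSOR_NOT_REQUIRED) return nullptr;
                throw std::runtime_error("missing required tensor: " + name);
            }
            auto * t = new fake_tensor{name, nbytes};
            if (!(flags & TENSOR_DUPLICATED)) {
                n_created++;                                 // duplicated tensors no longer need the
                size_data += nbytes;                         // manual n_created--/size_data fix-up
            }
            return t;
        }
        bool lookup(const std::string &) const { return true; }
    };
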
5335
5530
  case LLM_ARCH_STABLELM:
@@ -5358,17 +5553,17 @@ static bool llm_load_tensors(
5358
5553
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5359
5554
 
5360
5555
  // optional bias tensors, present in Stable LM 2 1.6B
5361
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
5362
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
5363
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
5556
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5557
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5558
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5364
5559
 
5365
5560
  // optional q and k layernorms, present in StableLM 2 12B
5366
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
5367
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
5561
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
5562
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
5368
5563
 
5369
5564
  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
5370
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
5371
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5565
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5566
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5372
5567
 
5373
5568
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5374
5569
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5411,12 +5606,10 @@ static bool llm_load_tensors(
5411
5606
  // output
5412
5607
  {
5413
5608
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5414
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5609
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5415
5610
  // if output is NULL, init from the input tok embed
5416
5611
  if (model.output == NULL) {
5417
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5418
- ml.n_created--; // artificial tensor
5419
- ml.size_data += ggml_nbytes(model.output);
5612
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5420
5613
  }
5421
5614
  }
5422
5615
 
@@ -5514,8 +5707,8 @@ static bool llm_load_tensors(
5514
5707
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5515
5708
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5516
5709
 
5517
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
5518
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5710
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5711
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5519
5712
 
5520
5713
  if (layer.wqkv == nullptr) {
5521
5714
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5552,17 +5745,20 @@ static bool llm_load_tensors(
5552
5745
  ggml_context* ctx_layer = ctx_for_layer(i);
5553
5746
  ggml_context* ctx_split = ctx_for_layer_split(i);
5554
5747
 
5555
- auto& layer = model.layers[i];
5748
+ auto & layer = model.layers[i];
5556
5749
 
5557
5750
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
5558
5751
 
5559
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
5560
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5752
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
5753
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5561
5754
 
5562
5755
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
5563
5756
 
5564
5757
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
5565
5758
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
5759
+
5760
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5761
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5566
5762
  }
5567
5763
  } break;
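
Phi-3 requests its per-layer LongRoPE frequency-factor tensors with both flags combined: every layer may omit them, and because the tensor name carries no layer index, every layer after the first is marked as a duplicate so the shared data is only counted once. A tiny sketch of the flag arithmetic, with hypothetical flag values:

    #include <cstdint>

    // hypothetical values; the point is only that the two bits compose with |
    enum : uint32_t { TENSOR_NOT_REQUIRED = 1, TENSOR_DUPLICATED = 2 };

    uint32_t rope_factor_flags(int il) {
        // every layer may omit the tensor; layers after the first reuse layer 0's data
        return TENSOR_NOT_REQUIRED | (il != 0 ? TENSOR_DUPLICATED : 0);
    }
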
5568
5764
  case LLM_ARCH_PLAMO:
@@ -5731,9 +5927,7 @@ static bool llm_load_tensors(
5731
5927
 
5732
5928
  // output
5733
5929
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5734
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
5735
- ml.n_created--; // artificial tensor
5736
- ml.size_data += ggml_nbytes(model.output);
5930
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
5737
5931
 
5738
5932
  const int64_t n_ff = hparams.n_ff;
5739
5933
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5768,12 +5962,10 @@ static bool llm_load_tensors(
5768
5962
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5769
5963
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5770
5964
 
5771
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5965
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5772
5966
  // if output is NULL, init from the input tok embed
5773
5967
  if (model.output == NULL) {
5774
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5775
- ml.n_created--; // artificial tensor
5776
- ml.size_data += ggml_nbytes(model.output);
5968
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5777
5969
  }
5778
5970
 
5779
5971
  }
@@ -5824,12 +6016,10 @@ static bool llm_load_tensors(
5824
6016
  {
5825
6017
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5826
6018
 
5827
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6019
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5828
6020
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
5829
6021
  if (model.output == NULL) {
5830
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5831
- ml.n_created--; // artificial tensor
5832
- ml.size_data += ggml_nbytes(model.output);
6022
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5833
6023
  }
5834
6024
  }
5835
6025
 
@@ -5890,9 +6080,7 @@ static bool llm_load_tensors(
5890
6080
  {
5891
6081
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5892
6082
  // init output from the input tok embed
5893
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5894
- ml.n_created--; // artificial tensor
5895
- ml.size_data += ggml_nbytes(model.output);
6083
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5896
6084
  }
5897
6085
 
5898
6086
  for (int i = 0; i < n_layer; ++i) {
@@ -5924,12 +6112,10 @@ static bool llm_load_tensors(
5924
6112
 
5925
6113
  // output
5926
6114
  {
5927
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6115
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5928
6116
  // if output is NULL, init from the input tok embed
5929
6117
  if (model.output == NULL) {
5930
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5931
- ml.n_created--; // artificial tensor
5932
- ml.size_data += ggml_nbytes(model.output);
6118
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5933
6119
  }
5934
6120
  }
5935
6121
 
@@ -5949,6 +6135,81 @@ static bool llm_load_tensors(
5949
6135
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5950
6136
  }
5951
6137
  } break;
6138
+ case LLM_ARCH_GPTNEOX:
6139
+ {
6140
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6141
+ // output
6142
+ {
6143
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6144
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
6145
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6146
+ }
6147
+
6148
+ for (int i = 0; i < n_layer; ++i) {
6149
+ ggml_context * ctx_layer = ctx_for_layer(i);
6150
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6151
+
6152
+ auto & layer = model.layers[i];
6153
+
6154
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6155
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
6156
+
6157
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
6158
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
6159
+
6160
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6161
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
6162
+
6163
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6164
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
6165
+
6166
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
6167
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
6168
+
6169
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6170
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
6171
+ }
6172
+ } break;
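
The GPT-NeoX loader added above uses a single fused QKV weight of shape {n_embd, n_embd + 2*n_embd_gqa}. A small standalone example of how that fused width decomposes into the Q, K and V row ranges (plain arithmetic with illustrative sizes, not ggml calls):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // example sizes, illustrative only
        const int64_t n_embd      = 4096;
        const int64_t n_head_kv   = 32;
        const int64_t n_embd_head = 128;
        const int64_t n_embd_gqa  = n_embd_head * n_head_kv;   // K/V width with grouped-query attention

        const int64_t qkv_rows = n_embd + 2*n_embd_gqa;         // fused projection width

        // row ranges of the fused projection that correspond to Q, K and V
        const int64_t q_off = 0;
        const int64_t k_off = n_embd;
        const int64_t v_off = n_embd + n_embd_gqa;

        std::printf("fused rows=%lld  Q@[%lld,%lld)  K@[%lld,%lld)  V@[%lld,%lld)\n",
                    (long long) qkv_rows,
                    (long long) q_off, (long long) k_off,
                    (long long) k_off, (long long) v_off,
                    (long long) v_off, (long long) qkv_rows);
        return 0;
    }
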
6173
+ case LLM_ARCH_ARCTIC:
6174
+ {
6175
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6176
+
6177
+ // output
6178
+ {
6179
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6180
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6181
+ // if output is NULL, init from the input tok embed
6182
+ if (model.output == NULL) {
6183
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6184
+ }
6185
+ }
6186
+
6187
+ for (int i = 0; i < n_layer; ++i) {
6188
+ ggml_context * ctx_layer = ctx_for_layer(i);
6189
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6190
+
6191
+ auto & layer = model.layers[i];
6192
+
6193
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6194
+
6195
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
6196
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
6197
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
6198
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6199
+
6200
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6201
+
6202
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
6203
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
6204
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
6205
+
6206
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6207
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
6208
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
6209
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
6210
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
6211
+ }
6212
+ } break;
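
The new Arctic case is a mixture-of-experts layout: ffn_gate_inp ({n_embd, n_expert}) produces router logits, and the ffn_*_exps tensors hold one FFN per expert. A self-contained sketch of the routing step those tensors imply, i.e. softmax over the router logits followed by a top-k pick (illustrative only, not the graph code):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    std::vector<int> top_k_experts(const std::vector<float> & logits, int k) {
        std::vector<float> probs(logits.size());
        const float m = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i] - m); sum += probs[i]; }
        for (auto & p : probs) p /= sum;

        std::vector<int> idx(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });
        idx.resize(k);
        return idx;                       // expert ids whose ffn_*_exps slices get evaluated
    }

    int main() {
        const std::vector<float> router_logits = {0.1f, 2.3f, -0.5f, 1.7f};   // n_expert = 4
        for (int e : top_k_experts(router_logits, 2)) std::printf("expert %d\n", e);
        return 0;
    }
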
5952
6213
  default:
5953
6214
  throw std::runtime_error("unknown architecture");
5954
6215
  }
@@ -6213,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(
6213
6474
 
6214
6475
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
6215
6476
  } else {
6216
- #ifdef GGML_USE_MPI
6217
- GGML_ASSERT(false && "not implemented");
6218
- #endif
6219
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6477
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6220
6478
  inpL = lctx.inp_embd;
6221
6479
  ggml_set_input(lctx.inp_embd);
6222
6480
  }
@@ -6318,7 +6576,7 @@ static struct ggml_tensor * llm_build_ffn(
6318
6576
  llm_ffn_gate_type type_gate,
6319
6577
  const llm_build_cb & cb,
6320
6578
  int il) {
6321
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
6579
+ struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
6322
6580
  cb(tmp, "ffn_up", il);
6323
6581
 
6324
6582
  if (up_b) {
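
llm_build_ffn now tolerates a missing up projection: up ? ggml_mul_mat(ctx, up, cur) : cur simply forwards the activations when no weight is present. A minimal sketch of the same guard outside ggml (names and shapes are illustrative):

    #include <vector>

    static std::vector<float> matvec(const std::vector<std::vector<float>> & W,
                                     const std::vector<float> & x) {
        std::vector<float> y(W.size(), 0.0f);
        for (size_t r = 0; r < W.size(); ++r)
            for (size_t c = 0; c < x.size(); ++c)
                y[r] += W[r][c] * x[c];
        return y;
    }

    // mirrors `up ? ggml_mul_mat(ctx, up, cur) : cur`: with no up weight,
    // the rest of the FFN operates directly on the incoming activations
    static std::vector<float> ffn_up(const std::vector<std::vector<float>> * up,
                                     const std::vector<float> & cur) {
        return up ? matvec(*up, cur) : cur;
    }
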
@@ -6500,7 +6758,6 @@ static struct ggml_tensor * llm_build_kqv(
6500
6758
  struct ggml_tensor * wo_b,
6501
6759
  struct ggml_tensor * q_cur,
6502
6760
  struct ggml_tensor * kq_mask,
6503
- struct ggml_tensor * kq_pos,
6504
6761
  int32_t n_tokens,
6505
6762
  int32_t n_kv,
6506
6763
  float kq_scale,
@@ -6512,6 +6769,7 @@ static struct ggml_tensor * llm_build_kqv(
6512
6769
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
6513
6770
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6514
6771
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
6772
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6515
6773
 
6516
6774
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
6517
6775
  cb(q, "q", il);
@@ -6530,31 +6788,27 @@ static struct ggml_tensor * llm_build_kqv(
6530
6788
  GGML_UNUSED(model);
6531
6789
  GGML_UNUSED(n_ctx);
6532
6790
 
6533
- // note: if this assert triggers, then some check has failed earlier
6534
- // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6535
- GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6536
-
6537
6791
  // split cached v into n_head heads (not transposed)
6538
6792
  struct ggml_tensor * v =
6539
6793
  ggml_view_3d(ctx, kv.v_l[il],
6540
6794
  n_embd_head_v, n_kv, n_head_kv,
6541
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6542
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6795
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
6796
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
6543
6797
  0);
6544
6798
  cb(v, "v", il);
6545
6799
 
6546
- cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6800
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6547
6801
 
6548
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6802
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6549
6803
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6550
6804
  }
6551
6805
 
6552
- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6806
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
6553
6807
  } else {
6554
6808
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6555
6809
  cb(kq, "kq", il);
6556
6810
 
6557
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6811
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6558
6812
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6559
6813
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6560
6814
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6574,28 +6828,8 @@ static struct ggml_tensor * llm_build_kqv(
6574
6828
  kq = ggml_scale(ctx, kq, 30);
6575
6829
  }
6576
6830
 
6577
- #if defined(GGML_USE_KOMPUTE)
6578
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6579
- #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6580
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6581
- if (hparams.use_alibi) {
6582
- kq = ggml_scale(ctx, kq, kq_scale);
6583
- cb(kq, "kq_scaled", il);
6584
-
6585
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6586
- cb(kq, "kq_scaled_alibi", il);
6587
-
6588
- kq = ggml_add(ctx, kq, kq_mask);
6589
- cb(kq, "kq_masked", il);
6590
-
6591
- kq = ggml_soft_max(ctx, kq);
6592
- cb(kq, "kq_soft_max", il);
6593
- } else
6594
- #endif
6595
- {
6596
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6597
- cb(kq, "kq_soft_max_ext", il);
6598
- }
6831
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6832
+ cb(kq, "kq_soft_max_ext", il);
6599
6833
 
6600
6834
  GGML_ASSERT(kv.size == n_ctx);
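
With kq_pos and the Kompute ggml_alibi fallback removed, the ALiBi bias is produced inside ggml_soft_max_ext from hparams.f_max_alibi_bias. The standalone sketch below shows the slope schedule that bias corresponds to, assuming n_head is a power of two; it is not ggml's exact code.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // ALiBi: head h gets a slope m_h, and the score for (query i, key j) receives
    // a bias of m_h * (j - i) before the soft-max (j <= i under a causal mask).
    std::vector<float> alibi_slopes(int n_head, float max_bias /* = hparams.f_max_alibi_bias */) {
        std::vector<float> m(n_head);
        const float base = std::pow(2.0f, -max_bias / n_head);   // max_bias = 8 gives the classic 2^(-8/n_head)
        for (int h = 0; h < n_head; ++h) {
            m[h] = std::pow(base, float(h + 1));
        }
        return m;
    }

    int main() {
        for (float s : alibi_slopes(8, 8.0f)) std::printf("%g ", s);
        std::printf("\n");
        return 0;
    }
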
6601
6835
 
@@ -6614,7 +6848,7 @@ static struct ggml_tensor * llm_build_kqv(
6614
6848
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6615
6849
  cb(kqv_merged, "kqv_merged", il);
6616
6850
 
6617
- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6851
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
6618
6852
  cb(cur, "kqv_merged_cont", il);
6619
6853
  }
6620
6854
 
@@ -6645,7 +6879,6 @@ static struct ggml_tensor * llm_build_kv(
6645
6879
  struct ggml_tensor * v_cur,
6646
6880
  struct ggml_tensor * q_cur,
6647
6881
  struct ggml_tensor * kq_mask,
6648
- struct ggml_tensor * kq_pos,
6649
6882
  int32_t n_tokens,
6650
6883
  int32_t kv_head,
6651
6884
  int32_t n_kv,
@@ -6664,7 +6897,7 @@ static struct ggml_tensor * llm_build_kv(
6664
6897
  struct ggml_tensor * cur;
6665
6898
 
6666
6899
  cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6667
- q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6900
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
6668
6901
  cb(cur, "kqv_out", il);
6669
6902
 
6670
6903
  return cur;
@@ -6771,18 +7004,17 @@ struct llm_build_context {
6771
7004
 
6772
7005
  ctx0 = ggml_init(params);
6773
7006
 
6774
- lctx.inp_tokens = nullptr;
6775
- lctx.inp_embd = nullptr;
6776
- lctx.inp_pos = nullptr;
7007
+ lctx.inp_tokens = nullptr;
7008
+ lctx.inp_embd = nullptr;
7009
+ lctx.inp_pos = nullptr;
6777
7010
  lctx.inp_out_ids = nullptr;
6778
7011
  lctx.inp_KQ_mask = nullptr;
6779
- lctx.inp_KQ_pos = nullptr;
6780
7012
  lctx.inp_K_shift = nullptr;
6781
- lctx.inp_mean = nullptr;
6782
- lctx.inp_cls = nullptr;
6783
- lctx.inp_s_copy = nullptr;
6784
- lctx.inp_s_mask = nullptr;
6785
- lctx.inp_s_seq = nullptr;
7013
+ lctx.inp_mean = nullptr;
7014
+ lctx.inp_cls = nullptr;
7015
+ lctx.inp_s_copy = nullptr;
7016
+ lctx.inp_s_mask = nullptr;
7017
+ lctx.inp_s_seq = nullptr;
6786
7018
  }
6787
7019
 
6788
7020
  void free() {
@@ -6801,17 +7033,20 @@ struct llm_build_context {
6801
7033
  cb(lctx.inp_K_shift, "K_shift", -1);
6802
7034
  ggml_set_input(lctx.inp_K_shift);
6803
7035
 
7036
+
6804
7037
  for (int il = 0; il < n_layer; ++il) {
7038
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
6805
7039
  struct ggml_tensor * tmp =
6806
7040
  // we rotate only the first n_rot dimensions
6807
- ggml_rope_custom_inplace(ctx0,
7041
+ ggml_rope_ext_inplace(ctx0,
6808
7042
  ggml_view_3d(ctx0, kv_self.k_l[il],
6809
7043
  n_embd_head_k, n_head_kv, n_ctx,
6810
7044
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
6811
7045
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6812
7046
  0),
6813
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7047
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6814
7048
  ext_factor, attn_factor, beta_fast, beta_slow);
7049
+
6815
7050
  cb(tmp, "K_shifted", il);
6816
7051
  ggml_build_forward_expand(gf, tmp);
6817
7052
  }
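
ggml_rope_ext takes the same arguments as the old ggml_rope_custom plus one extra tensor of per-dimension frequency factors; passing nullptr keeps plain RoPE, while the K-shift loop above passes the factors returned by build_rope_factors. A rough sketch of the angle computation those factors are assumed to modify; the factor dividing the rotation frequency reflects LongRoPE-style scaling and is an assumption, not a statement about ggml's internals.

    #include <cmath>
    #include <vector>

    // theta for rotary dimension pair i at position `pos`.
    // Without factors: theta_i = pos / base^(2i/n_dims).
    // With factors   : the per-pair factor rescales that frequency (assumed to divide it here).
    float rope_theta(int pos, int i, int n_dims, float freq_base,
                     const std::vector<float> * freq_factors /* e.g. rope_long / rope_short */) {
        float inv_freq = std::pow(freq_base, -2.0f * i / n_dims);
        if (freq_factors) {
            inv_freq /= (*freq_factors)[i];      // assumption: factors > 1 stretch the wavelength
        }
        return pos * inv_freq;
    }
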
@@ -6914,6 +7149,17 @@ struct llm_build_context {
6914
7149
  return lctx.inp_pos;
6915
7150
  }
6916
7151
 
7152
+ struct ggml_tensor * build_rope_factors(int il) {
7153
+ // choose long/short freq factors based on the context size
7154
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7155
+
7156
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7157
+ return model.layers[il].rope_long;
7158
+ }
7159
+
7160
+ return model.layers[il].rope_short;
7161
+ }
7162
+
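
build_rope_factors compares the per-sequence context (cparams.n_ctx / cparams.n_seq_max) with the model's original training context to decide between the long and short factor sets. A worked example with assumed numbers:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // assumed values for illustration
        const uint32_t n_ctx           = 16384;  // requested context
        const uint32_t n_seq_max       = 2;      // parallel sequences
        const uint32_t n_yarn_orig_ctx = 4096;   // original training context

        const uint32_t n_ctx_per_seq = n_ctx / n_seq_max;   // 8192
        const bool use_long = n_ctx_per_seq > n_yarn_orig_ctx;

        std::printf("n_ctx_per_seq = %u -> %s rope factors\n",
                    n_ctx_per_seq, use_long ? "long" : "short");
        return 0;
    }
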
6917
7163
  struct ggml_tensor * build_inp_out_ids() {
6918
7164
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
6919
7165
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -6932,19 +7178,6 @@ struct llm_build_context {
6932
7178
  return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6933
7179
  }
6934
7180
 
6935
- struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6936
- if (causal) {
6937
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6938
- } else {
6939
- // TODO: this will be needed for ALiBi-based BERT models
6940
- // https://github.com/ggerganov/llama.cpp/pull/6826
6941
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6942
- }
6943
- cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6944
- ggml_set_input(lctx.inp_KQ_pos);
6945
- return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6946
- }
6947
-
6948
7181
  struct ggml_tensor * build_inp_mean() {
6949
7182
  lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6950
7183
  cb(lctx.inp_mean, "inp_mean", -1);
@@ -7034,15 +7267,15 @@ struct llm_build_context {
7034
7267
  cb(Vcur, "Vcur", il);
7035
7268
  }
7036
7269
 
7037
- Qcur = ggml_rope_custom(
7038
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7270
+ Qcur = ggml_rope_ext(
7271
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7039
7272
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7040
7273
  ext_factor, attn_factor, beta_fast, beta_slow
7041
7274
  );
7042
7275
  cb(Qcur, "Qcur", il);
7043
7276
 
7044
- Kcur = ggml_rope_custom(
7045
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7277
+ Kcur = ggml_rope_ext(
7278
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7046
7279
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7047
7280
  ext_factor, attn_factor, beta_fast, beta_slow
7048
7281
  );
@@ -7050,7 +7283,7 @@ struct llm_build_context {
7050
7283
 
7051
7284
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7052
7285
  model.layers[il].wo, model.layers[il].bo,
7053
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7286
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7054
7287
  }
7055
7288
 
7056
7289
  if (il == n_layer - 1) {
@@ -7143,9 +7376,6 @@ struct llm_build_context {
7143
7376
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7144
7377
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7145
7378
 
7146
- // positions of the tokens in the KV cache
7147
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7148
-
7149
7379
  for (int il = 0; il < n_layer; ++il) {
7150
7380
  struct ggml_tensor * inpSA = inpL;
7151
7381
 
@@ -7167,13 +7397,13 @@ struct llm_build_context {
7167
7397
 
7168
7398
  switch (model.type) {
7169
7399
  case MODEL_7B:
7170
- Qcur = ggml_rope_custom(
7171
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7400
+ Qcur = ggml_rope_ext(
7401
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7172
7402
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7173
7403
  ext_factor, attn_factor, beta_fast, beta_slow
7174
7404
  );
7175
- Kcur = ggml_rope_custom(
7176
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7405
+ Kcur = ggml_rope_ext(
7406
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7177
7407
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7178
7408
  ext_factor, attn_factor, beta_fast, beta_slow
7179
7409
  );
@@ -7190,7 +7420,7 @@ struct llm_build_context {
7190
7420
 
7191
7421
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7192
7422
  model.layers[il].wo, NULL,
7193
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7423
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7194
7424
  }
7195
7425
 
7196
7426
  if (il == n_layer - 1) {
@@ -7260,9 +7490,6 @@ struct llm_build_context {
7260
7490
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7261
7491
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7262
7492
 
7263
- // positions of the tokens in the KV cache
7264
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7265
-
7266
7493
  for (int il = 0; il < n_layer; ++il) {
7267
7494
  struct ggml_tensor * inpSA = inpL;
7268
7495
 
@@ -7282,22 +7509,22 @@ struct llm_build_context {
7282
7509
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7283
7510
  cb(Vcur, "Vcur", il);
7284
7511
 
7285
- Qcur = ggml_rope_custom(
7286
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7512
+ Qcur = ggml_rope_ext(
7513
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7287
7514
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7288
7515
  ext_factor, attn_factor, beta_fast, beta_slow
7289
7516
  );
7290
7517
  cb(Qcur, "Qcur", il);
7291
7518
 
7292
- Kcur = ggml_rope_custom(
7293
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7519
+ Kcur = ggml_rope_ext(
7520
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7294
7521
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7295
7522
  ext_factor, attn_factor, beta_fast, beta_slow
7296
7523
  );
7297
7524
  cb(Kcur, "Kcur", il);
7298
7525
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7299
7526
  model.layers[il].wo, NULL,
7300
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7527
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7301
7528
  }
7302
7529
 
7303
7530
  if (il == n_layer - 1) {
@@ -7403,21 +7630,21 @@ struct llm_build_context {
7403
7630
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7404
7631
 
7405
7632
  // using mode = 2 for neox mode
7406
- Qcur = ggml_rope_custom(
7407
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7633
+ Qcur = ggml_rope_ext(
7634
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7408
7635
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7409
7636
  );
7410
7637
  cb(Qcur, "Qcur", il);
7411
7638
 
7412
- Kcur = ggml_rope_custom(
7413
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7639
+ Kcur = ggml_rope_ext(
7640
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7414
7641
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7415
7642
  );
7416
7643
  cb(Kcur, "Kcur", il);
7417
7644
 
7418
7645
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7419
7646
  model.layers[il].wo, NULL,
7420
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7647
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7421
7648
  }
7422
7649
 
7423
7650
  if (il == n_layer - 1) {
@@ -7526,15 +7753,15 @@ struct llm_build_context {
7526
7753
  cb(Vcur, "Vcur", il);
7527
7754
  }
7528
7755
 
7529
- Qcur = ggml_rope_custom(
7530
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7756
+ Qcur = ggml_rope_ext(
7757
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7531
7758
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7532
7759
  ext_factor, attn_factor, beta_fast, beta_slow
7533
7760
  );
7534
7761
  cb(Qcur, "Qcur", il);
7535
7762
 
7536
- Kcur = ggml_rope_custom(
7537
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7763
+ Kcur = ggml_rope_ext(
7764
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7538
7765
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7539
7766
  ext_factor, attn_factor, beta_fast, beta_slow
7540
7767
  );
@@ -7542,7 +7769,7 @@ struct llm_build_context {
7542
7769
 
7543
7770
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7544
7771
  model.layers[il].wo, model.layers[il].bo,
7545
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7772
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7546
7773
  }
7547
7774
 
7548
7775
  if (il == n_layer - 1) {
@@ -7678,15 +7905,15 @@ struct llm_build_context {
7678
7905
  cb(Kcur, "Kcur", il);
7679
7906
  cb(Vcur, "Vcur", il);
7680
7907
 
7681
- Qcur = ggml_rope_custom(
7682
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7908
+ Qcur = ggml_rope_ext(
7909
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7683
7910
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7684
7911
  ext_factor, attn_factor, beta_fast, beta_slow
7685
7912
  );
7686
7913
  cb(Qcur, "Qcur", il);
7687
7914
 
7688
- Kcur = ggml_rope_custom(
7689
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7915
+ Kcur = ggml_rope_ext(
7916
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7690
7917
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7691
7918
  ext_factor, attn_factor, beta_fast, beta_slow
7692
7919
  );
@@ -7694,7 +7921,7 @@ struct llm_build_context {
7694
7921
 
7695
7922
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7696
7923
  model.layers[il].wo, NULL,
7697
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7924
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7698
7925
  }
7699
7926
 
7700
7927
  if (il == n_layer - 1) {
@@ -7806,7 +8033,7 @@ struct llm_build_context {
7806
8033
 
7807
8034
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7808
8035
  model.layers[il].wo, model.layers[il].bo,
7809
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8036
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7810
8037
  }
7811
8038
 
7812
8039
  if (il == n_layer - 1) {
@@ -7855,266 +8082,56 @@ struct llm_build_context {
7855
8082
  return gf;
7856
8083
  }
7857
8084
 
7858
- struct ggml_cgraph * build_persimmon() {
8085
+ struct ggml_cgraph * build_refact() {
7859
8086
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7860
8087
 
7861
8088
  const int64_t n_embd_head = hparams.n_embd_head_v;
7862
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7863
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
8089
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7864
8090
 
7865
8091
  struct ggml_tensor * cur;
7866
8092
  struct ggml_tensor * inpL;
7867
8093
 
7868
8094
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7869
8095
 
7870
- // inp_pos - contains the positions
7871
- struct ggml_tensor * inp_pos = build_inp_pos();
7872
-
7873
8096
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7874
8097
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7875
8098
 
7876
8099
  for (int il = 0; il < n_layer; ++il) {
7877
- struct ggml_tensor * residual = inpL;
8100
+ struct ggml_tensor * inpSA = inpL;
7878
8101
 
7879
8102
  cur = llm_build_norm(ctx0, inpL, hparams,
7880
- model.layers[il].attn_norm,
7881
- model.layers[il].attn_norm_b,
7882
- LLM_NORM, cb, il);
8103
+ model.layers[il].attn_norm, NULL,
8104
+ LLM_NORM_RMS, cb, il);
7883
8105
  cb(cur, "attn_norm", il);
7884
8106
 
7885
- // self attention
8107
+ // self-attention
7886
8108
  {
7887
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7888
- cb(cur, "wqkv", il);
8109
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8110
+ cb(Qcur, "Qcur", il);
7889
8111
 
7890
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7891
- cb(cur, "bqkv", il);
8112
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8113
+ cb(Kcur, "Kcur", il);
7892
8114
 
7893
- // split qkv
7894
- GGML_ASSERT(n_head_kv == n_head);
8115
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8116
+ cb(Vcur, "Vcur", il);
7895
8117
 
7896
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7897
- cb(tmpqkv, "tmpqkv", il);
8118
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8119
+ cb(Kcur, "Kcur", il);
7898
8120
 
7899
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7900
- cb(tmpqkv_perm, "tmpqkv", il);
8121
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8122
+ cb(Qcur, "Qcur", il);
7901
8123
 
7902
- struct ggml_tensor * tmpq = ggml_view_3d(
7903
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7904
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7905
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7906
- 0
7907
- );
7908
- cb(tmpq, "tmpq", il);
8124
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8125
+ model.layers[il].wo, NULL,
8126
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8127
+ }
7909
8128
 
7910
- struct ggml_tensor * tmpk = ggml_view_3d(
7911
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7912
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7913
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7914
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
7915
- );
7916
- cb(tmpk, "tmpk", il);
7917
-
7918
- // Q/K Layernorm
7919
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
7920
- model.layers[il].attn_q_norm,
7921
- model.layers[il].attn_q_norm_b,
7922
- LLM_NORM, cb, il);
7923
- cb(tmpq, "tmpq", il);
7924
-
7925
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
7926
- model.layers[il].attn_k_norm,
7927
- model.layers[il].attn_k_norm_b,
7928
- LLM_NORM, cb, il);
7929
- cb(tmpk, "tmpk", il);
7930
-
7931
- // RoPE the first n_rot of q/k, pass the other half, and concat.
7932
- struct ggml_tensor * qrot = ggml_view_3d(
7933
- ctx0, tmpq, n_rot, n_head, n_tokens,
7934
- ggml_element_size(tmpq) * n_embd_head,
7935
- ggml_element_size(tmpq) * n_embd_head * n_head,
7936
- 0
7937
- );
7938
- cb(qrot, "qrot", il);
7939
-
7940
- struct ggml_tensor * krot = ggml_view_3d(
7941
- ctx0, tmpk, n_rot, n_head, n_tokens,
7942
- ggml_element_size(tmpk) * n_embd_head,
7943
- ggml_element_size(tmpk) * n_embd_head * n_head,
7944
- 0
7945
- );
7946
- cb(krot, "krot", il);
7947
-
7948
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
7949
- struct ggml_tensor * qpass = ggml_view_3d(
7950
- ctx0, tmpq, n_rot, n_head, n_tokens,
7951
- ggml_element_size(tmpq) * n_embd_head,
7952
- ggml_element_size(tmpq) * n_embd_head * n_head,
7953
- ggml_element_size(tmpq) * n_rot
7954
- );
7955
- cb(qpass, "qpass", il);
7956
-
7957
- struct ggml_tensor * kpass = ggml_view_3d(
7958
- ctx0, tmpk, n_rot, n_head, n_tokens,
7959
- ggml_element_size(tmpk) * n_embd_head,
7960
- ggml_element_size(tmpk) * n_embd_head * n_head,
7961
- ggml_element_size(tmpk) * n_rot
7962
- );
7963
- cb(kpass, "kpass", il);
7964
-
7965
- struct ggml_tensor * qrotated = ggml_rope_custom(
7966
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7967
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7968
- );
7969
- cb(qrotated, "qrotated", il);
7970
-
7971
- struct ggml_tensor * krotated = ggml_rope_custom(
7972
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7973
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7974
- );
7975
- cb(krotated, "krotated", il);
7976
-
7977
- // ggml currently only supports concatenation on dim=2
7978
- // so we need to permute qrot, qpass, concat, then permute back.
7979
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
7980
- cb(qrotated, "qrotated", il);
7981
-
7982
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
7983
- cb(krotated, "krotated", il);
7984
-
7985
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
7986
- cb(qpass, "qpass", il);
7987
-
7988
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
7989
- cb(kpass, "kpass", il);
7990
-
7991
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
7992
- cb(Qcur, "Qcur", il);
7993
-
7994
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
7995
- cb(Kcur, "Kcur", il);
7996
-
7997
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
7998
- cb(Q, "Q", il);
7999
-
8000
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
8001
- cb(Kcur, "Kcur", il);
8002
-
8003
- struct ggml_tensor * Vcur = ggml_view_3d(
8004
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
8005
- ggml_element_size(tmpqkv_perm) * n_embd_head,
8006
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
8007
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
8008
- );
8009
- cb(Vcur, "Vcur", il);
8010
-
8011
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8012
- model.layers[il].wo, model.layers[il].bo,
8013
- Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8014
- }
8015
-
8016
- if (il == n_layer - 1) {
8017
- // skip computing output for unused tokens
8018
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8019
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8020
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8021
- }
8022
-
8023
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
8024
- cb(ffn_inp, "ffn_inp", il);
8025
-
8026
- // feed-forward network
8027
- {
8028
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
8029
- model.layers[il].ffn_norm,
8030
- model.layers[il].ffn_norm_b,
8031
- LLM_NORM, cb, il);
8032
- cb(cur, "ffn_norm", il);
8033
-
8034
- cur = llm_build_ffn(ctx0, cur,
8035
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8036
- NULL, NULL,
8037
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8038
- NULL,
8039
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
8040
- cb(cur, "ffn_out", il);
8041
- }
8042
-
8043
- cur = ggml_add(ctx0, cur, ffn_inp);
8044
- cb(cur, "l_out", il);
8045
-
8046
- inpL = cur;
8047
- }
8048
-
8049
- cur = inpL;
8050
-
8051
- cur = llm_build_norm(ctx0, cur, hparams,
8052
- model.output_norm,
8053
- model.output_norm_b,
8054
- LLM_NORM, cb, -1);
8055
- cb(cur, "result_norm", -1);
8056
-
8057
- cur = ggml_mul_mat(ctx0, model.output, cur);
8058
- cb(cur, "result_output", -1);
8059
-
8060
- ggml_build_forward_expand(gf, cur);
8061
-
8062
- return gf;
8063
- }
8064
-
8065
- struct ggml_cgraph * build_refact() {
8066
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8067
-
8068
- const int64_t n_embd_head = hparams.n_embd_head_v;
8069
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8070
-
8071
- struct ggml_tensor * cur;
8072
- struct ggml_tensor * inpL;
8073
-
8074
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8075
-
8076
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8077
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8078
-
8079
- // positions of the tokens in the KV cache
8080
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8081
-
8082
- for (int il = 0; il < n_layer; ++il) {
8083
- struct ggml_tensor * inpSA = inpL;
8084
-
8085
- cur = llm_build_norm(ctx0, inpL, hparams,
8086
- model.layers[il].attn_norm, NULL,
8087
- LLM_NORM_RMS, cb, il);
8088
- cb(cur, "attn_norm", il);
8089
-
8090
- // self-attention
8091
- {
8092
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8093
- cb(Qcur, "Qcur", il);
8094
-
8095
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8096
- cb(Kcur, "Kcur", il);
8097
-
8098
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8099
- cb(Vcur, "Vcur", il);
8100
-
8101
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8102
- cb(Kcur, "Kcur", il);
8103
-
8104
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8105
- cb(Qcur, "Qcur", il);
8106
-
8107
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8108
- model.layers[il].wo, NULL,
8109
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8110
- }
8111
-
8112
- if (il == n_layer - 1) {
8113
- // skip computing output for unused tokens
8114
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8115
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8116
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8117
- }
8129
+ if (il == n_layer - 1) {
8130
+ // skip computing output for unused tokens
8131
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8132
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8133
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8134
+ }
8118
8135
 
8119
8136
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8120
8137
  cb(ffn_inp, "ffn_inp", il);
@@ -8168,8 +8185,11 @@ struct llm_build_context {
8168
8185
 
8169
8186
  struct ggml_tensor * cur;
8170
8187
  struct ggml_tensor * inpL;
8188
+ struct ggml_tensor * inp_pos = nullptr;
8171
8189
 
8172
- struct ggml_tensor * inp_pos = build_inp_pos();
8190
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8191
+ inp_pos = build_inp_pos();
8192
+ }
8173
8193
  struct ggml_tensor * inp_mean = build_inp_mean();
8174
8194
  struct ggml_tensor * inp_cls = build_inp_cls();
8175
8195
 
@@ -8200,13 +8220,26 @@ struct llm_build_context {
8200
8220
  struct ggml_tensor * Vcur;
8201
8221
 
8202
8222
  // self-attention
8203
- if (model.arch == LLM_ARCH_BERT) {
8223
+ if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
8204
8224
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
8205
8225
  cb(Qcur, "Qcur", il);
8206
8226
 
8227
+ if (model.layers[il].attn_q_norm) {
8228
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8229
+ model.layers[il].attn_q_norm,
8230
+ model.layers[il].attn_q_norm_b,
8231
+ LLM_NORM, cb, il);
8232
+ }
8233
+
8207
8234
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
8208
8235
  cb(Kcur, "Kcur", il);
8209
8236
 
8237
+ if (model.layers[il].attn_k_norm) {
8238
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8239
+ model.layers[il].attn_k_norm,
8240
+ model.layers[il].attn_k_norm_b,
8241
+ LLM_NORM, cb, il);
8242
+ }
8210
8243
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
8211
8244
  cb(Vcur, "Vcur", il);
8212
8245
 
@@ -8225,15 +8258,15 @@ struct llm_build_context {
8225
8258
  cb(Kcur, "Kcur", il);
8226
8259
  cb(Vcur, "Vcur", il);
8227
8260
 
8228
- Qcur = ggml_rope_custom(
8229
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8261
+ Qcur = ggml_rope_ext(
8262
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8230
8263
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8231
8264
  ext_factor, attn_factor, beta_fast, beta_slow
8232
8265
  );
8233
8266
  cb(Qcur, "Qcur", il);
8234
8267
 
8235
- Kcur = ggml_rope_custom(
8236
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8268
+ Kcur = ggml_rope_ext(
8269
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8237
8270
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8238
8271
  ext_factor, attn_factor, beta_fast, beta_slow
8239
8272
  );
@@ -8246,7 +8279,7 @@ struct llm_build_context {
8246
8279
  struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
8247
8280
  cb(kq, "kq", il);
8248
8281
 
8249
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8282
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8250
8283
  cb(kq, "kq_soft_max_ext", il);
8251
8284
 
8252
8285
  struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8297,6 +8330,13 @@ struct llm_build_context {
8297
8330
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8298
8331
  NULL,
8299
8332
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8333
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
8334
+ cur = llm_build_ffn(ctx0, cur,
8335
+ model.layers[il].ffn_up, NULL,
8336
+ model.layers[il].ffn_gate, NULL,
8337
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8338
+ NULL,
8339
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
8300
8340
  } else {
8301
8341
  cur = llm_build_ffn(ctx0, cur,
8302
8342
  model.layers[il].ffn_up, NULL,
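
For LLM_ARCH_JINA_BERT_V2 the FFN is built with LLM_FFN_GELU and LLM_FFN_PAR, i.e. a parallel gated (GEGLU-style) block: the gate branch is GELU-activated and multiplied element-wise with the up branch before the down projection and its bias. A minimal sketch on plain vectors; shapes and helper names are illustrative.

    #include <cmath>
    #include <vector>

    static float gelu(float x) {                       // tanh approximation of GELU
        return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
    }

    // out = W_down * ( GELU(W_gate * x) ⊙ (W_up * x) ) + b_down
    std::vector<float> geglu_ffn(const std::vector<std::vector<float>> & W_up,
                                 const std::vector<std::vector<float>> & W_gate,
                                 const std::vector<std::vector<float>> & W_down,
                                 const std::vector<float> & b_down,
                                 const std::vector<float> & x) {
        auto matvec = [](const std::vector<std::vector<float>> & W, const std::vector<float> & v) {
            std::vector<float> y(W.size(), 0.0f);
            for (size_t r = 0; r < W.size(); ++r)
                for (size_t c = 0; c < v.size(); ++c)
                    y[r] += W[r][c] * v[c];
            return y;
        };

        std::vector<float> up   = matvec(W_up,   x);
        std::vector<float> gate = matvec(W_gate, x);
        for (size_t i = 0; i < up.size(); ++i) up[i] *= gelu(gate[i]);   // parallel gate

        std::vector<float> out = matvec(W_down, up);
        for (size_t i = 0; i < out.size(); ++i) out[i] += b_down[i];
        return out;
    }
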
@@ -8363,9 +8403,6 @@ struct llm_build_context {
8363
8403
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8364
8404
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8365
8405
 
8366
- // positions of the tokens in the KV cache
8367
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8368
-
8369
8406
  inpL = llm_build_norm(ctx0, inpL, hparams,
8370
8407
  model.tok_norm,
8371
8408
  model.tok_norm_b,
@@ -8399,7 +8436,7 @@ struct llm_build_context {
8399
8436
 
8400
8437
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8401
8438
  model.layers[il].wo, model.layers[il].bo,
8402
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8439
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8403
8440
  }
8404
8441
 
8405
8442
  if (il == n_layer - 1) {
@@ -8464,9 +8501,6 @@ struct llm_build_context {
8464
8501
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8465
8502
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8466
8503
 
8467
- // positions of the tokens in the KV cache
8468
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8469
-
8470
8504
  if (model.pos_embd) {
8471
8505
  // inp_pos - contains the positions
8472
8506
  struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8530,13 +8564,13 @@ struct llm_build_context {
8530
8564
 
8531
8565
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8532
8566
  model.layers[il].wo, model.layers[il].bo,
8533
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8567
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8534
8568
  } else {
8535
8569
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8536
8570
 
8537
8571
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8538
8572
  model.layers[il].wo, model.layers[il].bo,
8539
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8573
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8540
8574
  }
8541
8575
  }
8542
8576
 
@@ -8664,15 +8698,15 @@ struct llm_build_context {
8664
8698
  }
8665
8699
 
8666
8700
 
8667
- Qcur = ggml_rope_custom(
8668
- ctx0, Qcur, inp_pos,
8701
+ Qcur = ggml_rope_ext(
8702
+ ctx0, Qcur, inp_pos, nullptr,
8669
8703
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8670
8704
  ext_factor, attn_factor, beta_fast, beta_slow
8671
8705
  );
8672
8706
  cb(Qcur, "Qcur", il);
8673
8707
 
8674
- Kcur = ggml_rope_custom(
8675
- ctx0, Kcur, inp_pos,
8708
+ Kcur = ggml_rope_ext(
8709
+ ctx0, Kcur, inp_pos, nullptr,
8676
8710
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8677
8711
  ext_factor, attn_factor, beta_fast, beta_slow
8678
8712
  );
@@ -8680,7 +8714,7 @@ struct llm_build_context {
8680
8714
 
8681
8715
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8682
8716
  model.layers[il].wo, NULL,
8683
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8717
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8684
8718
  }
8685
8719
 
8686
8720
  if (il == n_layer - 1) {
@@ -8784,21 +8818,21 @@ struct llm_build_context {
8784
8818
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8785
8819
 
8786
8820
  // using mode = 2 for neox mode
8787
- Qcur = ggml_rope_custom(
8788
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8821
+ Qcur = ggml_rope_ext(
8822
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8789
8823
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8790
8824
  );
8791
8825
  cb(Qcur, "Qcur", il);
8792
8826
 
8793
- Kcur = ggml_rope_custom(
8794
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8827
+ Kcur = ggml_rope_ext(
8828
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8795
8829
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8796
8830
  );
8797
8831
  cb(Kcur, "Kcur", il);
8798
8832
 
8799
8833
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8800
8834
  model.layers[il].wo, NULL,
8801
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8835
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8802
8836
  }
8803
8837
 
8804
8838
  if (il == n_layer - 1) {
@@ -8895,15 +8929,15 @@ struct llm_build_context {
8895
8929
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8896
8930
  cb(Vcur, "Vcur", il);
8897
8931
 
8898
- Qcur = ggml_rope_custom(
8899
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8932
+ Qcur = ggml_rope_ext(
8933
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8900
8934
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8901
8935
  ext_factor, attn_factor, beta_fast, beta_slow
8902
8936
  );
8903
8937
  cb(Qcur, "Qcur", il);
8904
8938
 
8905
- Kcur = ggml_rope_custom(
8906
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8939
+ Kcur = ggml_rope_ext(
8940
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8907
8941
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8908
8942
  ext_factor, attn_factor, beta_fast, beta_slow
8909
8943
  );
@@ -8911,7 +8945,7 @@ struct llm_build_context {
8911
8945
 
8912
8946
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8913
8947
  model.layers[il].wo, model.layers[il].bo,
8914
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8948
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8915
8949
  }
8916
8950
 
8917
8951
  if (il == n_layer - 1) {
@@ -9009,15 +9043,15 @@ struct llm_build_context {
9009
9043
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9010
9044
  cb(Vcur, "Vcur", il);
9011
9045
 
9012
- Qcur = ggml_rope_custom(
9013
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9046
+ Qcur = ggml_rope_ext(
9047
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9014
9048
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9015
9049
  ext_factor, attn_factor, beta_fast, beta_slow
9016
9050
  );
9017
9051
  cb(Qcur, "Qcur", il);
9018
9052
 
9019
- Kcur = ggml_rope_custom(
9020
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9053
+ Kcur = ggml_rope_ext(
9054
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9021
9055
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9022
9056
  ext_factor, attn_factor, beta_fast, beta_slow
9023
9057
  );
@@ -9025,7 +9059,7 @@ struct llm_build_context {
9025
9059
 
9026
9060
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9027
9061
  model.layers[il].wo, model.layers[il].bo,
9028
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9062
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9029
9063
  }
9030
9064
 
9031
9065
  if (il == n_layer - 1) {
@@ -9161,8 +9195,8 @@ struct llm_build_context {
9161
9195
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9162
9196
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9163
9197
 
9164
- Qcur = ggml_rope_custom(
9165
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9198
+ Qcur = ggml_rope_ext(
9199
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9166
9200
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9167
9201
  );
9168
9202
  cb(Qcur, "Qcur", il);
@@ -9172,15 +9206,15 @@ struct llm_build_context {
9172
9206
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9173
9207
  cb(Qcur, "Qcur", il);
9174
9208
 
9175
- Kcur = ggml_rope_custom(
9176
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9209
+ Kcur = ggml_rope_ext(
9210
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9177
9211
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9178
9212
  );
9179
9213
  cb(Kcur, "Kcur", il);
9180
9214
 
9181
9215
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9182
9216
  model.layers[il].wo, model.layers[il].bo,
9183
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9217
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9184
9218
  }
9185
9219
 
9186
9220
  if (il == n_layer - 1) {
@@ -9249,6 +9283,9 @@ struct llm_build_context {
9249
9283
 
9250
9284
  // self-attention
9251
9285
  {
9286
+ // rope freq factors for 128k context
9287
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
9288
+
9252
9289
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9253
9290
  model.layers[il].attn_norm,
9254
9291
  NULL,
@@ -9280,8 +9317,8 @@ struct llm_build_context {
9280
9317
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9281
9318
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9282
9319
 
9283
- Qcur = ggml_rope_custom(
9284
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9320
+ Qcur = ggml_rope_ext(
9321
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9285
9322
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9286
9323
  );
9287
9324
  cb(Qcur, "Qcur", il);
@@ -9289,15 +9326,15 @@ struct llm_build_context {
9289
9326
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9290
9327
  cb(Qcur, "Qcur", il);
9291
9328
 
9292
- Kcur = ggml_rope_custom(
9293
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9329
+ Kcur = ggml_rope_ext(
9330
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9294
9331
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9295
9332
  );
9296
9333
  cb(Kcur, "Kcur", il);
9297
9334
 
9298
9335
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9299
9336
  model.layers[il].wo, model.layers[il].bo,
9300
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9337
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9301
9338
  }
9302
9339
 
9303
9340
  if (il == n_layer - 1) {
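The new rope_factors tensor above is what the extra ggml_rope_ext argument is for: long-context models such as Phi-3-128k ship per-dimension factors that rescale the rotary frequencies. A standalone sketch of the arithmetic, assuming the factors act as divisors of the per-pair angle (the diff itself only shows the tensor being threaded through):

#include <cmath>
#include <cstdio>

int main() {
    const int   d        = 8;          // head dimension (toy value)
    const float base     = 10000.0f;   // RoPE base frequency
    const float pos      = 42.0f;      // token position
    const float factor[] = {1.0f, 1.0f, 4.0f, 4.0f};   // hypothetical long-context factors

    for (int i = 0; i < d/2; ++i) {
        // plain RoPE angle for dimension pair i, then the factor-scaled one
        const float theta = pos * std::pow(base, -2.0f * i / d);
        printf("pair %d: theta = %9.4f  scaled = %9.4f\n", i, theta, theta / factor[i]);
    }
    return 0;
}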
@@ -9396,21 +9433,21 @@ struct llm_build_context {
9396
9433
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9397
9434
  cb(Vcur, "Vcur", il);
9398
9435
 
9399
- Qcur = ggml_rope_custom(
9400
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
9436
+ Qcur = ggml_rope_ext(
9437
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9401
9438
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9402
9439
  ext_factor, attn_factor, beta_fast, beta_slow);
9403
9440
  cb(Qcur, "Qcur", il);
9404
9441
 
9405
- Kcur = ggml_rope_custom(
9406
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
9442
+ Kcur = ggml_rope_ext(
9443
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9407
9444
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9408
9445
  ext_factor, attn_factor, beta_fast, beta_slow);
9409
9446
  cb(Kcur, "Kcur", il);
9410
9447
 
9411
9448
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9412
9449
  model.layers[il].wo, NULL,
9413
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9450
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9414
9451
  }
9415
9452
  struct ggml_tensor * sa_out = cur;
9416
9453
 
@@ -9513,7 +9550,7 @@ struct llm_build_context {
9513
9550
 
9514
9551
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9515
9552
  model.layers[il].wo, model.layers[il].bo,
9516
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9553
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9517
9554
  }
9518
9555
 
9519
9556
  if (il == n_layer - 1) {
@@ -9604,15 +9641,15 @@ struct llm_build_context {
9604
9641
  cb(tmpk, "tmpk", il);
9605
9642
  cb(Vcur, "Vcur", il);
9606
9643
 
9607
- struct ggml_tensor * Qcur = ggml_rope_custom(
9608
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
9644
+ struct ggml_tensor * Qcur = ggml_rope_ext(
9645
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9609
9646
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9610
9647
  ext_factor, attn_factor, beta_fast, beta_slow
9611
9648
  );
9612
9649
  cb(Qcur, "Qcur", il);
9613
9650
 
9614
- struct ggml_tensor * Kcur = ggml_rope_custom(
9615
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
9651
+ struct ggml_tensor * Kcur = ggml_rope_ext(
9652
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9616
9653
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9617
9654
  ext_factor, attn_factor, beta_fast, beta_slow
9618
9655
  );
@@ -9620,7 +9657,7 @@ struct llm_build_context {
9620
9657
 
9621
9658
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9622
9659
  model.layers[il].wo, model.layers[il].bo,
9623
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9660
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9624
9661
  }
9625
9662
 
9626
9663
  if (il == n_layer - 1) {
@@ -9720,15 +9757,15 @@ struct llm_build_context {
9720
9757
  // cb(Vcur, "Vcur", il);
9721
9758
  // }
9722
9759
 
9723
- Qcur = ggml_rope_custom(
9724
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9760
+ Qcur = ggml_rope_ext(
9761
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9725
9762
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9726
9763
  ext_factor, attn_factor, beta_fast, beta_slow
9727
9764
  );
9728
9765
  cb(Qcur, "Qcur", il);
9729
9766
 
9730
- Kcur = ggml_rope_custom(
9731
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9767
+ Kcur = ggml_rope_ext(
9768
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9732
9769
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9733
9770
  ext_factor, attn_factor, beta_fast, beta_slow
9734
9771
  );
@@ -9736,7 +9773,7 @@ struct llm_build_context {
9736
9773
 
9737
9774
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9738
9775
  model.layers[il].wo, NULL,
9739
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9776
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9740
9777
  }
9741
9778
 
9742
9779
  if (il == n_layer - 1) {
@@ -9837,15 +9874,15 @@ struct llm_build_context {
9837
9874
  cb(Vcur, "Vcur", il);
9838
9875
  }
9839
9876
 
9840
- Qcur = ggml_rope_custom(
9841
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9877
+ Qcur = ggml_rope_ext(
9878
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9842
9879
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9843
9880
  ext_factor, attn_factor, beta_fast, beta_slow
9844
9881
  );
9845
9882
  cb(Qcur, "Qcur", il);
9846
9883
 
9847
- Kcur = ggml_rope_custom(
9848
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9884
+ Kcur = ggml_rope_ext(
9885
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9849
9886
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9850
9887
  ext_factor, attn_factor, beta_fast, beta_slow
9851
9888
  );
@@ -9853,7 +9890,7 @@ struct llm_build_context {
9853
9890
 
9854
9891
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9855
9892
  model.layers[il].wo, model.layers[il].bo,
9856
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9893
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9857
9894
  }
9858
9895
 
9859
9896
  if (il == n_layer - 1) {
@@ -9967,15 +10004,15 @@ struct llm_build_context {
9967
10004
  cb(Vcur, "Vcur", il);
9968
10005
  }
9969
10006
 
9970
- Qcur = ggml_rope_custom(
9971
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10007
+ Qcur = ggml_rope_ext(
10008
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9972
10009
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9973
10010
  ext_factor, attn_factor, beta_fast, beta_slow
9974
10011
  );
9975
10012
  cb(Qcur, "Qcur", il);
9976
10013
 
9977
- Kcur = ggml_rope_custom(
9978
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10014
+ Kcur = ggml_rope_ext(
10015
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9979
10016
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9980
10017
  ext_factor, attn_factor, beta_fast, beta_slow
9981
10018
  );
@@ -9983,7 +10020,7 @@ struct llm_build_context {
9983
10020
 
9984
10021
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9985
10022
  model.layers[il].wo, model.layers[il].bo,
9986
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10023
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9987
10024
  }
9988
10025
 
9989
10026
  if (il == n_layer - 1) {
@@ -10087,8 +10124,8 @@ struct llm_build_context {
10087
10124
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10088
10125
  cb(Vcur, "Vcur", il);
10089
10126
 
10090
- Qcur = ggml_rope_custom(
10091
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
10127
+ Qcur = ggml_rope_ext(
10128
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10092
10129
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10093
10130
  ext_factor, attn_factor, beta_fast, beta_slow);
10094
10131
  cb(Qcur, "Qcur", il);
@@ -10096,15 +10133,15 @@ struct llm_build_context {
10096
10133
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
10097
10134
  cb(Qcur, "Qcur_scaled", il);
10098
10135
 
10099
- Kcur = ggml_rope_custom(
10100
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
10136
+ Kcur = ggml_rope_ext(
10137
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10101
10138
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10102
10139
  ext_factor, attn_factor, beta_fast, beta_slow);
10103
10140
  cb(Kcur, "Kcur", il);
10104
10141
 
10105
10142
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10106
10143
  model.layers[il].wo, NULL,
10107
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10144
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10108
10145
  }
10109
10146
 
10110
10147
  if (il == n_layer - 1) {
@@ -10207,15 +10244,15 @@ struct llm_build_context {
10207
10244
  cb(Vcur, "Vcur", il);
10208
10245
  }
10209
10246
 
10210
- Qcur = ggml_rope_custom(
10211
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10247
+ Qcur = ggml_rope_ext(
10248
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10212
10249
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10213
10250
  ext_factor, attn_factor, beta_fast, beta_slow
10214
10251
  );
10215
10252
  cb(Qcur, "Qcur", il);
10216
10253
 
10217
- Kcur = ggml_rope_custom(
10218
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10254
+ Kcur = ggml_rope_ext(
10255
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10219
10256
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10220
10257
  ext_factor, attn_factor, beta_fast, beta_slow
10221
10258
  );
@@ -10223,7 +10260,7 @@ struct llm_build_context {
10223
10260
 
10224
10261
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10225
10262
  model.layers[il].wo, model.layers[il].bo,
10226
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10263
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10227
10264
  }
10228
10265
 
10229
10266
  if (il == n_layer - 1) {
@@ -10490,22 +10527,267 @@ struct llm_build_context {
10490
10527
  LLM_NORM, cb, il);
10491
10528
  cb(Qcur, "Qcur", il);
10492
10529
 
10493
- Kcur = llm_build_norm(ctx0, Kcur, hparams,
10494
- model.layers[il].attn_k_norm,
10495
- NULL,
10496
- LLM_NORM, cb, il);
10497
- cb(Kcur, "Kcur", il);
10498
- }
10530
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
10531
+ model.layers[il].attn_k_norm,
10532
+ NULL,
10533
+ LLM_NORM, cb, il);
10534
+ cb(Kcur, "Kcur", il);
10535
+ }
10536
+
10537
+ Qcur = ggml_rope_ext(
10538
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10539
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10540
+ ext_factor, attn_factor, beta_fast, beta_slow
10541
+ );
10542
+ cb(Qcur, "Qcur", il);
10543
+
10544
+ Kcur = ggml_rope_ext(
10545
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10546
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10547
+ ext_factor, attn_factor, beta_fast, beta_slow
10548
+ );
10549
+ cb(Kcur, "Kcur", il);
10550
+
10551
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10552
+ model.layers[il].wo, model.layers[il].bo,
10553
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10554
+ }
10555
+
10556
+ if (il == n_layer - 1) {
10557
+ // skip computing output for unused tokens
10558
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10559
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10560
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10561
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
10562
+ }
10563
+
10564
+ struct ggml_tensor * attn_out = cur;
10565
+
10566
+ // feed-forward network
10567
+ {
10568
+ cur = llm_build_ffn(ctx0, ffn_inp,
10569
+ model.layers[il].ffn_up, NULL,
10570
+ model.layers[il].ffn_gate, NULL,
10571
+ model.layers[il].ffn_down, NULL,
10572
+ NULL,
10573
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10574
+ cb(cur, "ffn_out", il);
10575
+ }
10576
+
10577
+ // add together residual + FFN + self-attention
10578
+ cur = ggml_add(ctx0, cur, inpL);
10579
+ cur = ggml_add(ctx0, cur, attn_out);
10580
+ cb(cur, "l_out", il);
10581
+
10582
+ // input for next layer
10583
+ inpL = cur;
10584
+ }
10585
+
10586
+ cur = inpL;
10587
+
10588
+ cur = llm_build_norm(ctx0, cur, hparams,
10589
+ model.output_norm, NULL,
10590
+ LLM_NORM, cb, -1);
10591
+ cb(cur, "result_norm", -1);
10592
+
10593
+ // lm_head
10594
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10595
+
10596
+ if (f_logit_scale) {
10597
+ cur = ggml_scale(ctx0, cur, f_logit_scale);
10598
+ }
10599
+
10600
+ cb(cur, "result_output", -1);
10601
+
10602
+ ggml_build_forward_expand(gf, cur);
10603
+
10604
+ return gf;
10605
+
10606
+ }
10607
+
10608
+ // ref: https://allenai.org/olmo
10609
+ // based on the original build_llama() function, changes:
10610
+ // * non-parametric layer norm
10611
+ // * clamp qkv
10612
+ // * removed bias
10613
+ // * removed MoE
10614
+ struct ggml_cgraph * build_olmo() {
10615
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10616
+
10617
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10618
+ int32_t n_tokens = this->n_tokens;
10619
+
10620
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10621
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10622
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10623
+
10624
+ struct ggml_tensor * cur;
10625
+ struct ggml_tensor * inpL;
10626
+
10627
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10628
+
10629
+ // inp_pos - contains the positions
10630
+ struct ggml_tensor * inp_pos = build_inp_pos();
10631
+
10632
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10633
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10634
+
10635
+ for (int il = 0; il < n_layer; ++il) {
10636
+ struct ggml_tensor * inpSA = inpL;
10637
+
10638
+ // norm
10639
+ cur = llm_build_norm(ctx0, inpL, hparams,
10640
+ NULL, NULL,
10641
+ LLM_NORM, cb, il);
10642
+ cb(cur, "attn_norm", il);
10643
+
10644
+ // self-attention
10645
+ {
10646
+ // compute Q and K and RoPE them
10647
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10648
+ cb(Qcur, "Qcur", il);
10649
+ if (hparams.f_clamp_kqv > 0.0f) {
10650
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10651
+ cb(Qcur, "Qcur", il);
10652
+ }
10653
+
10654
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10655
+ cb(Kcur, "Kcur", il);
10656
+ if (hparams.f_clamp_kqv > 0.0f) {
10657
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10658
+ cb(Kcur, "Kcur", il);
10659
+ }
10660
+
10661
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10662
+ cb(Vcur, "Vcur", il);
10663
+ if (hparams.f_clamp_kqv > 0.0f) {
10664
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10665
+ cb(Vcur, "Vcur", il);
10666
+ }
10667
+
10668
+ Qcur = ggml_rope_ext(
10669
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10670
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10671
+ ext_factor, attn_factor, beta_fast, beta_slow
10672
+ );
10673
+ cb(Qcur, "Qcur", il);
10674
+
10675
+ Kcur = ggml_rope_ext(
10676
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10677
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10678
+ ext_factor, attn_factor, beta_fast, beta_slow
10679
+ );
10680
+ cb(Kcur, "Kcur", il);
10681
+
10682
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10683
+ model.layers[il].wo, nullptr,
10684
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10685
+ }
10686
+
10687
+ if (il == n_layer - 1) {
10688
+ // skip computing output for unused tokens
10689
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10690
+ n_tokens = n_outputs;
10691
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10692
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10693
+ }
10694
+
10695
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10696
+ cb(ffn_inp, "ffn_inp", il);
10697
+
10698
+ // feed-forward network
10699
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10700
+ NULL, NULL,
10701
+ LLM_NORM, cb, il);
10702
+ cb(cur, "ffn_norm", il);
10703
+
10704
+ cur = llm_build_ffn(ctx0, cur,
10705
+ model.layers[il].ffn_up, NULL,
10706
+ model.layers[il].ffn_gate, NULL,
10707
+ model.layers[il].ffn_down, NULL,
10708
+ NULL,
10709
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10710
+ cb(cur, "ffn_out", il);
10711
+
10712
+ cur = ggml_add(ctx0, cur, ffn_inp);
10713
+ cb(cur, "ffn_out", il);
10714
+
10715
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10716
+ if (layer_dir != nullptr) {
10717
+ cur = ggml_add(ctx0, cur, layer_dir);
10718
+ }
10719
+ cb(cur, "l_out", il);
10720
+
10721
+ // input for next layer
10722
+ inpL = cur;
10723
+ }
10724
+
10725
+ cur = inpL;
10726
+
10727
+ cur = llm_build_norm(ctx0, cur, hparams,
10728
+ NULL, NULL,
10729
+ LLM_NORM, cb, -1);
10730
+ cb(cur, "result_norm", -1);
10731
+
10732
+ // lm_head
10733
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10734
+ cb(cur, "result_output", -1);
10735
+
10736
+ ggml_build_forward_expand(gf, cur);
10737
+
10738
+ return gf;
10739
+ }
10740
+
10741
+ struct ggml_cgraph * build_gptneox() {
10742
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10743
+
10744
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10745
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
10746
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10747
+
10748
+ struct ggml_tensor * cur;
10749
+ struct ggml_tensor * inpL;
10750
+
10751
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10752
+
10753
+ // inp_pos - contains the positions
10754
+ struct ggml_tensor * inp_pos = build_inp_pos();
10755
+
10756
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10757
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10758
+
10759
+ for (int il = 0; il < n_layer; ++il) {
10760
+ cur = llm_build_norm(ctx0, inpL, hparams,
10761
+ model.layers[il].attn_norm,
10762
+ model.layers[il].attn_norm_b,
10763
+ LLM_NORM, cb, il);
10764
+ cb(cur, "attn_norm", il);
10765
+
10766
+ // self-attention
10767
+ {
10768
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10769
+ cb(cur, "wqkv", il);
10770
+
10771
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10772
+ cb(cur, "bqkv", il);
10773
+
10774
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10775
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
10776
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10777
+
10778
+ cb(Qcur, "Qcur", il);
10779
+ cb(Kcur, "Kcur", il);
10780
+ cb(Vcur, "Vcur", il);
10499
10781
 
10500
- Qcur = ggml_rope_custom(
10501
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10782
+ Qcur = ggml_rope_ext(
10783
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10502
10784
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10503
10785
  ext_factor, attn_factor, beta_fast, beta_slow
10504
10786
  );
10505
10787
  cb(Qcur, "Qcur", il);
10506
10788
 
10507
- Kcur = ggml_rope_custom(
10508
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10789
+ Kcur = ggml_rope_ext(
10790
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10509
10791
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10510
10792
  ext_factor, attn_factor, beta_fast, beta_slow
10511
10793
  );
@@ -10513,68 +10795,84 @@ struct llm_build_context {
10513
10795
 
10514
10796
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10515
10797
  model.layers[il].wo, model.layers[il].bo,
10516
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10798
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10517
10799
  }
10518
10800
 
10519
10801
  if (il == n_layer - 1) {
10520
10802
  // skip computing output for unused tokens
10521
10803
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10522
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10523
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10524
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
10804
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10805
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10525
10806
  }
10526
10807
 
10527
- struct ggml_tensor * attn_out = cur;
10808
+ // ffn
10809
+ if (hparams.use_par_res) {
10810
+ // attention and ffn are computed in parallel
10811
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
10528
10812
 
10529
- // feed-forward network
10530
- {
10531
- cur = llm_build_ffn(ctx0, ffn_inp,
10532
- model.layers[il].ffn_up, NULL,
10533
- model.layers[il].ffn_gate, NULL,
10534
- model.layers[il].ffn_down, NULL,
10813
+ struct ggml_tensor * attn_out = cur;
10814
+
10815
+ cur = llm_build_norm(ctx0, inpL, hparams,
10816
+ model.layers[il].ffn_norm,
10817
+ model.layers[il].ffn_norm_b,
10818
+ LLM_NORM, cb, il);
10819
+ cb(cur, "ffn_norm", il);
10820
+
10821
+ cur = llm_build_ffn(ctx0, cur,
10822
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10823
+ NULL, NULL,
10824
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10535
10825
  NULL,
10536
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10826
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10537
10827
  cb(cur, "ffn_out", il);
10538
- }
10539
10828
 
10540
- // add together residual + FFN + self-attention
10541
- cur = ggml_add(ctx0, cur, inpL);
10542
- cur = ggml_add(ctx0, cur, attn_out);
10543
- cb(cur, "l_out", il);
10829
+ cur = ggml_add(ctx0, cur, inpL);
10830
+ cb(cur, "ffn_out", il);
10544
10831
 
10545
- // input for next layer
10546
- inpL = cur;
10547
- }
10832
+ inpL = ggml_add(ctx0, cur, attn_out);
10833
+ cb(inpL, "l_out", il);
10834
+ } else {
10835
+ // attention and ffn are computed sequentially
10836
+ // x = x + attn(ln1(x))
10837
+ // x = x + ffn(ln2(x))
10548
10838
 
10549
- cur = inpL;
10839
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
10840
+ cb(ffn_inp, "ffn_inp", il);
10550
10841
 
10551
- cur = llm_build_norm(ctx0, cur, hparams,
10552
- model.output_norm, NULL,
10553
- LLM_NORM, cb, -1);
10554
- cb(cur, "result_norm", -1);
10842
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10843
+ model.layers[il].ffn_norm,
10844
+ model.layers[il].ffn_norm_b,
10845
+ LLM_NORM, cb, il);
10846
+ cb(cur, "ffn_norm", il);
10555
10847
 
10556
- // lm_head
10557
- cur = ggml_mul_mat(ctx0, model.output, cur);
10848
+ cur = llm_build_ffn(ctx0, cur,
10849
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10850
+ NULL, NULL,
10851
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10852
+ NULL,
10853
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10854
+ cb(cur, "ffn_out", il);
10558
10855
 
10559
- if (f_logit_scale) {
10560
- cur = ggml_scale(ctx0, cur, f_logit_scale);
10856
+ inpL = ggml_add(ctx0, cur, ffn_inp);
10857
+ cb(inpL, "l_out", il);
10858
+ }
10561
10859
  }
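The new build_gptneox() above selects between two residual layouts with hparams.use_par_res, as the comments spell out. A toy illustration of the wiring (ours, not llama.cpp code; attn/ffn/ln are stand-ins):

#include <functional>

using layer_fn = std::function<float(float)>;

float neox_block(float x, const layer_fn & ln1, const layer_fn & ln2,
                 const layer_fn & attn, const layer_fn & ffn, bool use_par_res) {
    if (use_par_res) {
        // parallel: x = x + attn(ln1(x)) + ffn(ln2(x))
        return x + attn(ln1(x)) + ffn(ln2(x));
    }
    // sequential: x = x + attn(ln1(x)); x = x + ffn(ln2(x))
    const float h = x + attn(ln1(x));
    return h + ffn(ln2(h));
}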
10562
10860
 
10861
+ cur = llm_build_norm(ctx0, inpL, hparams,
10862
+ model.output_norm,
10863
+ model.output_norm_b,
10864
+ LLM_NORM, cb, -1);
10865
+ cb(cur, "result_norm", -1);
10866
+
10867
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10563
10868
  cb(cur, "result_output", -1);
10564
10869
 
10565
10870
  ggml_build_forward_expand(gf, cur);
10566
10871
 
10567
10872
  return gf;
10568
-
10569
10873
  }
10570
10874
 
10571
- // ref: https://allenai.org/olmo
10572
- // based on the original build_llama() function, changes:
10573
- // * non-parametric layer norm
10574
- // * clamp qkv
10575
- // * removed bias
10576
- // * removed MoE
10577
- struct ggml_cgraph * build_olmo() {
10875
+ struct ggml_cgraph * build_arctic() {
10578
10876
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10579
10877
 
10580
10878
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -10600,8 +10898,8 @@ struct llm_build_context {
10600
10898
 
10601
10899
  // norm
10602
10900
  cur = llm_build_norm(ctx0, inpL, hparams,
10603
- NULL, NULL,
10604
- LLM_NORM, cb, il);
10901
+ model.layers[il].attn_norm, NULL,
10902
+ LLM_NORM_RMS, cb, il);
10605
10903
  cb(cur, "attn_norm", il);
10606
10904
 
10607
10905
  // self-attention
@@ -10609,42 +10907,30 @@ struct llm_build_context {
10609
10907
  // compute Q and K and RoPE them
10610
10908
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10611
10909
  cb(Qcur, "Qcur", il);
10612
- if (hparams.f_clamp_kqv > 0.0f) {
10613
- Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10614
- cb(Qcur, "Qcur", il);
10615
- }
10616
10910
 
10617
10911
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10618
10912
  cb(Kcur, "Kcur", il);
10619
- if (hparams.f_clamp_kqv > 0.0f) {
10620
- Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10621
- cb(Kcur, "Kcur", il);
10622
- }
10623
10913
 
10624
10914
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10625
10915
  cb(Vcur, "Vcur", il);
10626
- if (hparams.f_clamp_kqv > 0.0f) {
10627
- Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10628
- cb(Vcur, "Vcur", il);
10629
- }
10630
10916
 
10631
- Qcur = ggml_rope_custom(
10632
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10917
+ Qcur = ggml_rope_ext(
10918
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10633
10919
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10634
10920
  ext_factor, attn_factor, beta_fast, beta_slow
10635
10921
  );
10636
10922
  cb(Qcur, "Qcur", il);
10637
10923
 
10638
- Kcur = ggml_rope_custom(
10639
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10924
+ Kcur = ggml_rope_ext(
10925
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10640
10926
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10641
10927
  ext_factor, attn_factor, beta_fast, beta_slow
10642
10928
  );
10643
10929
  cb(Kcur, "Kcur", il);
10644
10930
 
10645
10931
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10646
- model.layers[il].wo, nullptr,
10647
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10932
+ model.layers[il].wo, NULL,
10933
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10648
10934
  }
10649
10935
 
10650
10936
  if (il == n_layer - 1) {
@@ -10660,8 +10946,8 @@ struct llm_build_context {
10660
10946
 
10661
10947
  // feed-forward network
10662
10948
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
10663
- NULL, NULL,
10664
- LLM_NORM, cb, il);
10949
+ model.layers[il].ffn_norm, NULL,
10950
+ LLM_NORM_RMS, cb, il);
10665
10951
  cb(cur, "ffn_norm", il);
10666
10952
 
10667
10953
  cur = llm_build_ffn(ctx0, cur,
@@ -10672,7 +10958,26 @@ struct llm_build_context {
10672
10958
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10673
10959
  cb(cur, "ffn_out", il);
10674
10960
 
10675
- cur = ggml_add(ctx0, cur, ffn_inp);
10961
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
10962
+ cb(ffn_out, "ffn_out", il);
10963
+
10964
+ // MoE
10965
+ cur = llm_build_norm(ctx0, inpSA, hparams,
10966
+ model.layers[il].ffn_norm_exps, NULL,
10967
+ LLM_NORM_RMS, cb, il);
10968
+ cb(cur, "ffn_norm_exps", il);
10969
+
10970
+ cur = llm_build_moe_ffn(ctx0, cur,
10971
+ model.layers[il].ffn_gate_inp,
10972
+ model.layers[il].ffn_up_exps,
10973
+ model.layers[il].ffn_gate_exps,
10974
+ model.layers[il].ffn_down_exps,
10975
+ n_expert, n_expert_used,
10976
+ LLM_FFN_SILU, true,
10977
+ cb, il);
10978
+ cb(cur, "ffn_moe_out", il);
10979
+
10980
+ cur = ggml_add(ctx0, cur, ffn_out);
10676
10981
  cb(cur, "ffn_out", il);
10677
10982
 
10678
10983
  ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
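build_arctic() runs a dense FFN on the post-attention residual and an MoE branch on the layer input, then sums the two, following the tensor names in the hunk above. A rough scalar sketch of that tail (ours, not llama.cpp code):

#include <functional>

using layer_fn = std::function<float(float)>;

float arctic_layer_tail(float ffn_inp /* post-attention residual */, float inpSA /* layer input */,
                        const layer_fn & rms_norm, const layer_fn & rms_norm_exps,
                        const layer_fn & dense_ffn, const layer_fn & moe_ffn) {
    const float ffn_out = dense_ffn(rms_norm(ffn_inp)) + ffn_inp;   // "ffn_out" in the diff
    const float moe_out = moe_ffn(rms_norm_exps(inpSA));            // "ffn_moe_out"
    return moe_out + ffn_out;                                       // summed into the final "ffn_out"
}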
@@ -10688,8 +10993,8 @@ struct llm_build_context {
10688
10993
  cur = inpL;
10689
10994
 
10690
10995
  cur = llm_build_norm(ctx0, cur, hparams,
10691
- NULL, NULL,
10692
- LLM_NORM, cb, -1);
10996
+ model.output_norm, NULL,
10997
+ LLM_NORM_RMS, cb, -1);
10693
10998
  cb(cur, "result_norm", -1);
10694
10999
 
10695
11000
  // lm_head
@@ -10816,15 +11121,12 @@ static struct ggml_cgraph * llama_build_graph(
10816
11121
  {
10817
11122
  result = llm.build_starcoder();
10818
11123
  } break;
10819
- case LLM_ARCH_PERSIMMON:
10820
- {
10821
- result = llm.build_persimmon();
10822
- } break;
10823
11124
  case LLM_ARCH_REFACT:
10824
11125
  {
10825
11126
  result = llm.build_refact();
10826
11127
  } break;
10827
11128
  case LLM_ARCH_BERT:
11129
+ case LLM_ARCH_JINA_BERT_V2:
10828
11130
  case LLM_ARCH_NOMIC_BERT:
10829
11131
  {
10830
11132
  result = llm.build_bert();
@@ -10913,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
10913
11215
  {
10914
11216
  result = llm.build_olmo();
10915
11217
  } break;
11218
+ case LLM_ARCH_GPTNEOX:
11219
+ {
11220
+ result = llm.build_gptneox();
11221
+ } break;
11222
+ case LLM_ARCH_ARCTIC:
11223
+ {
11224
+ result = llm.build_arctic();
11225
+ } break;
10916
11226
  default:
10917
11227
  GGML_ASSERT(false);
10918
11228
  }
@@ -11032,11 +11342,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11032
11342
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
11033
11343
  f = -INFINITY;
11034
11344
  } else {
11035
- f = 0.0f;
11345
+ if (hparams.use_alibi) {
11346
+ f = -fabs(lctx.kv_self.cells[i].pos - pos);
11347
+ } else {
11348
+ f = 0.0f;
11349
+ }
11036
11350
  }
11037
11351
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
11038
11352
  }
11039
11353
  }
11354
+
11355
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
11356
+ for (int j = 0; j < n_kv; ++j) {
11357
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
11358
+ }
11359
+ }
11040
11360
  }
11041
11361
  } else {
11042
11362
  // when using kv cache, the mask needs to match the kv cache size
@@ -11055,7 +11375,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11055
11375
  float f = -INFINITY;
11056
11376
  for (int s = 0; s < batch.n_seq_id[i]; ++s) {
11057
11377
  if (batch.seq_id[i][s] == seq_id) {
11058
- f = 0.0f;
11378
+ if (hparams.use_alibi) {
11379
+ f = -fabs(batch.pos[i] - batch.pos[j]);
11380
+ } else {
11381
+ f = 0.0f;
11382
+ }
11059
11383
  break;
11060
11384
  }
11061
11385
  }
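With the KQ_pos tensor removed later in this diff, the ALiBi distance term is now written straight into KQ_mask: visible entries hold -|pos_i - pos_j| instead of 0, and rows past n_tokens are padded with -INFINITY up to GGML_PAD(n_tokens, GGML_KQ_MASK_PAD). The per-head ALiBi slope is applied further downstream, not in the mask. A standalone sketch (ours) of the mask for a toy batch:

#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
    const int n_tokens = 3, n_kv = 4, n_pad = 4;      // toy sizes, n_pad = padded row count
    const std::vector<int> pos    = {4, 5, 6};        // positions of the batch tokens
    const std::vector<int> kv_pos = {3, 4, 5, 6};     // positions stored in the KV cache
    const bool  use_alibi = true;
    const float neg_inf   = -std::numeric_limits<float>::infinity();

    std::vector<float> mask(n_pad * n_kv, neg_inf);   // padded rows stay at -inf
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_kv; ++j) {
            if (kv_pos[j] > pos[i]) continue;          // future cell: keep -inf (causal)
            mask[i*n_kv + j] = use_alibi ? -std::fabs(float(kv_pos[j] - pos[i])) : 0.0f;
        }
    }
    for (int i = 0; i < n_pad; ++i) {
        for (int j = 0; j < n_kv; ++j) printf("%7.1f", mask[i*n_kv + j]);
        printf("\n");
    }
    return 0;
}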
@@ -11071,21 +11395,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11071
11395
  }
11072
11396
  }
11073
11397
 
11074
- // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11075
- // this allows to process multiple sequences in parallel with ALiBi-based models
11076
- if (hparams.use_alibi) {
11077
- const int64_t n_kv = kv_self.n;
11078
-
11079
- GGML_ASSERT(lctx.inp_KQ_pos);
11080
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
11081
-
11082
- float * data = (float *) lctx.inp_KQ_pos->data;
11083
-
11084
- for (int i = 0; i < n_kv; ++i) {
11085
- data[i] = float(lctx.kv_self.cells[i].pos);
11086
- }
11087
- }
11088
-
11089
11398
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
11090
11399
  const int64_t n_tokens = batch.n_tokens;
11091
11400
 
@@ -11259,11 +11568,6 @@ static void llama_graph_compute(
11259
11568
  llama_context & lctx,
11260
11569
  ggml_cgraph * gf,
11261
11570
  int n_threads) {
11262
- #ifdef GGML_USE_MPI
11263
- const int64_t n_layer = lctx.model.hparams.n_layer;
11264
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
11265
- #endif
11266
-
11267
11571
  #ifdef GGML_USE_METAL
11268
11572
  if (ggml_backend_is_metal(lctx.backend_metal)) {
11269
11573
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11278,10 +11582,6 @@ static void llama_graph_compute(
11278
11582
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
11279
11583
 
11280
11584
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
11281
-
11282
- #ifdef GGML_USE_MPI
11283
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
11284
- #endif
11285
11585
  }
11286
11586
 
11287
11587
  // decode a batch of tokens by evaluating the transformer
@@ -11319,12 +11619,6 @@ static int llama_decode_internal(
11319
11619
  }
11320
11620
  lctx.n_queued_tokens += n_tokens_all;
11321
11621
 
11322
- #ifdef GGML_USE_MPI
11323
- // TODO: needs fix after #3228
11324
- GGML_ASSERT(false && "not implemented");
11325
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
11326
- #endif
11327
-
11328
11622
  auto & kv_self = lctx.kv_self;
11329
11623
 
11330
11624
  const int64_t n_embd = hparams.n_embd;
@@ -11455,7 +11749,8 @@ static int llama_decode_internal(
11455
11749
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11456
11750
  // after enough generations, the benefit from this heuristic disappears
11457
11751
  // if we start defragmenting the cache, the benefit from this will be more important
11458
- kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11752
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
11753
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
11459
11754
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11460
11755
  }
11461
11756
  }
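The cache-size heuristic keeps its shape, but the hard-coded 256 becomes llama_kv_cache_get_padding(cparams), so the rounding granularity now comes from the context parameters. A worked example of the arithmetic (the pad value of 256 is assumed here, matching the old constant):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// GGML_PAD(x, n): round x up to a multiple of n
static uint32_t pad_up(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

int main() {
    const uint32_t kv_size  = 4096;   // kv_self.size
    const uint32_t cell_max = 300;    // llama_kv_cache_cell_max(kv_self), hypothetical
    const uint32_t pad      = 256;    // assumed padding value

    const uint32_t n = std::min(kv_size, std::max(pad, pad_up(cell_max, pad)));
    printf("kv_self.n = %u\n", n);    // 512: only the used, padded part of the cache is attended
    return 0;
}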
@@ -12200,13 +12495,14 @@ struct llm_tokenizer_bpe {
12200
12495
 
12201
12496
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12202
12497
  int final_prev_index = -1;
12498
+ bool ignore_merges = false;
12203
12499
 
12204
12500
  std::vector<std::string> word_collection;
12205
12501
  switch (vocab.type) {
12206
12502
  case LLAMA_VOCAB_TYPE_BPE:
12207
12503
  switch (vocab.type_pre) {
12208
12504
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12209
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
12505
+ ignore_merges = true;
12210
12506
  word_collection = unicode_regex_split(text, {
12211
12507
  // original regex from tokenizer.json
12212
12508
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12511,12 @@ struct llm_tokenizer_bpe {
12215
12511
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12216
12512
  });
12217
12513
  break;
12514
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12515
+ word_collection = unicode_regex_split(text, {
12516
+ // same as llama3
12517
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12518
+ });
12519
+ break;
12218
12520
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12219
12521
  word_collection = unicode_regex_split(text, {
12220
12522
  "[\r\n]",
@@ -12266,6 +12568,7 @@ struct llm_tokenizer_bpe {
12266
12568
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12267
12569
  });
12268
12570
  break;
12571
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
12269
12572
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12270
12573
  word_collection = unicode_regex_split(text, {
12271
12574
  // original regex from tokenizer.json
@@ -12298,6 +12601,11 @@ struct llm_tokenizer_bpe {
12298
12601
  int index = 0;
12299
12602
  size_t offset = 0;
12300
12603
 
12604
+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
12605
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
12606
+ offset = word.size();
12607
+ }
12608
+
12301
12609
  while (offset < word.size()) {
12302
12610
  llm_symbol sym;
12303
12611
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
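ignore_merges, enabled above for the llama3 pre-tokenizer, short-circuits the BPE loop: if a whole pre-split word is already a vocabulary entry, it is emitted as a single symbol and never broken into characters for merging. A simplified sketch (ours, with made-up token ids):

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    const std::unordered_map<std::string, int> token_to_id = {
        {" hello", 128000}, {"h", 71}, {"e", 68},    // hypothetical ids
    };
    const std::vector<std::string> words = {" hello", " zzz"};

    for (const auto & word : words) {
        const auto it = token_to_id.find(word);
        if (it != token_to_id.end()) {
            printf("'%s' -> single token %d (merges skipped)\n", word.c_str(), it->second);
        } else {
            printf("'%s' -> split into symbols and merged as before\n", word.c_str());
        }
    }
    return 0;
}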
@@ -12483,16 +12791,16 @@ struct llm_tokenizer_wpm {
12483
12791
  // to lowercase, pad chinese characters, pad punctuation
12484
12792
  std::string new_str = "";
12485
12793
  for (uint32_t code : cpts_nfd) {
12486
- int type = unicode_cpt_type(code);
12487
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
12794
+ const codepoint_flags flags = unicode_cpt_flags(code);
12795
+ if (flags.is_accent_mark || flags.is_control) {
12488
12796
  continue;
12489
12797
  }
12490
12798
  code = unicode_tolower(code);
12491
- if (type == CODEPOINT_TYPE_SEPARATOR) {
12799
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12492
12800
  code = ' ';
12493
12801
  }
12494
12802
  std::string s = unicode_cpt_to_utf8(code);
12495
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
12803
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12496
12804
  new_str += " ";
12497
12805
  new_str += s;
12498
12806
  new_str += " ";
@@ -12695,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12695
13003
  // tokenizer.encode('', add_special_tokens=True) returns [1]
12696
13004
  // tokenizer.encode('', add_special_tokens=False) returns []
12697
13005
 
13006
+ static const bool rtrim = true; //TODO: as param
13007
+ bool is_prev_special = false;
13008
+ bool special_token_rtrim = false;
13009
+
12698
13010
  if (add_special && vocab.special_add_bos != 0) {
12699
13011
  GGML_ASSERT(vocab.special_bos_id != -1);
12700
13012
  output.push_back(vocab.special_bos_id);
13013
+ is_prev_special = true;
12701
13014
  }
12702
13015
 
12703
13016
  for (const auto & fragment : fragment_buffer) {
@@ -12709,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12709
13022
  // and passing 'add space prefix' as bool argument
12710
13023
  //
12711
13024
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12712
- if (&fragment == &fragment_buffer.front()) {
12713
- if (vocab.add_space_prefix) {
12714
- raw_text = " " + raw_text; // prefix with space if the first token is not special
13025
+
13026
+ if (special_token_rtrim) {
13027
+ size_t num_whitespaces = 0;
13028
+ while (isspace(raw_text[num_whitespaces])) {
13029
+ num_whitespaces++;
13030
+ }
13031
+ if (num_whitespaces == raw_text.size()) {
13032
+ continue; // skip if the fragment is all whitespace
13033
+ }
13034
+ raw_text = raw_text.substr(num_whitespaces);
13035
+ }
13036
+
13037
+ if (vocab.add_space_prefix) {
13038
+ if (!output.size() || is_prev_special) { // prefix with space if first token
13039
+ raw_text = " " + raw_text;
12715
13040
  }
12716
13041
  }
12717
13042
 
@@ -12723,9 +13048,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12723
13048
  tokenizer.tokenize(raw_text, output);
12724
13049
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
12725
13050
  output.push_back(fragment.token);
13051
+ is_prev_special = true;
13052
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
13053
+ special_token_rtrim = rtrim
13054
+ && fragment.token != vocab.special_bos_id
13055
+ && fragment.token != vocab.special_unk_id
13056
+ && fragment.token != vocab.special_eos_id;
12726
13057
  }
12727
13058
  }
12728
13059
 
13060
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13061
+ LLAMA_LOG_WARN(
13062
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13063
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13064
+ "Are you sure this is what you want?\n", __FUNCTION__);
13065
+ }
13066
+
12729
13067
  if (add_special && vocab.special_add_eos == 1) {
12730
13068
  GGML_ASSERT(vocab.special_eos_id != -1);
12731
13069
  output.push_back(vocab.special_eos_id);
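The special_token_rtrim flag added above strips leading whitespace from the text fragment that follows a special token (except BOS/UNK/EOS), so "<|assistant|>   Hi" tokenizes like "<|assistant|>Hi". A minimal sketch of the trimming step (ours; the token string is illustrative):

#include <cctype>
#include <cstdio>
#include <string>

static std::string ltrim(const std::string & s) {
    size_t n = 0;
    while (n < s.size() && std::isspace((unsigned char) s[n])) n++;
    return s.substr(n);
}

int main() {
    const bool after_rtrim_special = true;   // previous fragment was e.g. "<|assistant|>"
    std::string fragment = "   Hi there";
    if (after_rtrim_special) fragment = ltrim(fragment);
    printf("fragment passed to the tokenizer: '%s'\n", fragment.c_str());
    return 0;
}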
@@ -12752,7 +13090,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12752
13090
  }
12753
13091
  }
12754
13092
 
12755
- GGML_ASSERT(vocab.special_add_eos != 1);
13093
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13094
+ LLAMA_LOG_WARN(
13095
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13096
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13097
+ "Are you sure this is what you want?\n", __FUNCTION__);
13098
+ }
13099
+
13100
+ if (add_special && vocab.special_add_eos == 1) {
13101
+ GGML_ASSERT(vocab.special_add_eos != -1);
13102
+ output.push_back(vocab.special_eos_id);
13103
+ }
12756
13104
  } break;
12757
13105
  case LLAMA_VOCAB_TYPE_WPM:
12758
13106
  {
@@ -13106,6 +13454,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
13106
13454
  return rejects;
13107
13455
  }
13108
13456
 
13457
+ static bool llama_grammar_detect_left_recursion(
13458
+ const std::vector<std::vector<llama_grammar_element>> & rules,
13459
+ size_t rule_index,
13460
+ std::vector<bool> * rules_visited,
13461
+ std::vector<bool> * rules_in_progress,
13462
+ std::vector<bool> * rules_may_be_empty) {
13463
+ if ((*rules_in_progress)[rule_index]) {
13464
+ return true;
13465
+ }
13466
+
13467
+ (*rules_in_progress)[rule_index] = true;
13468
+
13469
+ const std::vector<llama_grammar_element> & rule = rules[rule_index];
13470
+
13471
+ // First check if the rule might produce the empty string. This could be done combined with the second
13472
+ // step but it's more readable as two steps.
13473
+ bool at_rule_start = true;
13474
+ for (size_t i = 0; i < rule.size(); i++) {
13475
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
13476
+ if (at_rule_start) {
13477
+ (*rules_may_be_empty)[rule_index] = true;
13478
+ break;
13479
+ }
13480
+ at_rule_start = true;
13481
+ } else {
13482
+ at_rule_start = false;
13483
+ }
13484
+ }
13485
+
13486
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
13487
+ // be empty)
13488
+ bool recurse_into_nonterminal = true;
13489
+ for (size_t i = 0; i < rule.size(); i++) {
13490
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
13491
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
13492
+ return true;
13493
+ }
13494
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
13495
+ recurse_into_nonterminal = false;
13496
+ }
13497
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
13498
+ recurse_into_nonterminal = true;
13499
+ } else {
13500
+ recurse_into_nonterminal = false;
13501
+ }
13502
+ }
13503
+
13504
+ (*rules_in_progress)[rule_index] = false;
13505
+ (*rules_visited)[rule_index] = true;
13506
+ return false;
13507
+ }
13508
+
13109
13509
  //
13110
13510
  // grammar - external
13111
13511
  //
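llama_grammar_detect_left_recursion() above walks each rule, first marking rules that can produce the empty string and then recursing into leftmost nonterminals (skipping past possibly-empty ones); llama_grammar_init() below uses it to reject grammars containing such a cycle. An illustration of the kind of rule that is now refused, with a right-recursive rewrite that is accepted (GBNF shown as plain strings, ours):

#include <cstdio>

int main() {
    const char * left_recursive  = "expr ::= expr \"+\" term | term";   // rejected: expr is leftmost in its own rule
    const char * right_recursive = "expr ::= term (\"+\" expr)?";       // accepted: recursion is not leftmost
    printf("rejected: %s\naccepted: %s\n", left_recursive, right_recursive);
    return 0;
}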
@@ -13125,6 +13525,19 @@ struct llama_grammar * llama_grammar_init(
13125
13525
  vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
13126
13526
  }
13127
13527
 
13528
+ // Check for left recursion
13529
+ std::vector<bool> rules_visited(n_rules);
13530
+ std::vector<bool> rules_in_progress(n_rules);
13531
+ std::vector<bool> rules_may_be_empty(n_rules);
13532
+ for (size_t i = 0; i < n_rules; i++) {
13533
+ if (rules_visited[i]) {
13534
+ continue;
13535
+ }
13536
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
13537
+ throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
13538
+ }
13539
+ }
13540
+
13128
13541
  // loop over alternates of start rule to build initial stacks
13129
13542
  std::vector<std::vector<const llama_grammar_element *>> stacks;
13130
13543
  pos = vec_rules[start_rule_index].data();
@@ -13147,6 +13560,9 @@ struct llama_grammar * llama_grammar_init(
13147
13560
  }
13148
13561
  } while (true);
13149
13562
 
13563
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
13564
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
13565
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
13150
13566
  return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
13151
13567
  }
13152
13568
 
@@ -13741,9 +14157,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
13741
14157
 
13742
14158
  // Sample the next word X using top-k sampling
13743
14159
  llama_sample_top_k(nullptr, candidates, int(k), 1);
13744
- if (ctx) {
13745
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13746
- }
14160
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13747
14161
  llama_token X = llama_sample_token(ctx, candidates);
13748
14162
  t_start_sample_us = ggml_time_us();
13749
14163
 
@@ -13757,9 +14171,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
13757
14171
  // Update mu using the learning rate and error
13758
14172
  *mu = *mu - eta * e;
13759
14173
 
13760
- if (ctx) {
13761
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13762
- }
14174
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13763
14175
  return X;
13764
14176
  }
13765
14177
 
@@ -14344,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14344
14756
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
14345
14757
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
14346
14758
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
14347
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
14348
- (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
14349
14759
  if (qs.model.type == MODEL_70B) {
14350
14760
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
14351
14761
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15246,6 +15656,7 @@ struct llama_model_params llama_model_default_params() {
15246
15656
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
15247
15657
  /*.main_gpu =*/ 0,
15248
15658
  /*.tensor_split =*/ nullptr,
15659
+ /*.rpc_servers =*/ nullptr,
15249
15660
  /*.progress_callback =*/ nullptr,
15250
15661
  /*.progress_callback_user_data =*/ nullptr,
15251
15662
  /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15727,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
15316
15727
  }
15317
15728
 
15318
15729
  size_t llama_max_devices(void) {
15319
- #if defined(GGML_USE_METAL)
15730
+ #if defined(GGML_USE_RPC)
15731
+ return GGML_RPC_MAX_SERVERS;
15732
+ #elif defined(GGML_USE_METAL)
15320
15733
  return 1;
15321
15734
  #elif defined(GGML_USE_CUDA)
15322
15735
  return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15752,7 @@ bool llama_supports_mlock(void) {
15339
15752
 
15340
15753
  bool llama_supports_gpu_offload(void) {
15341
15754
  #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
15342
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
15755
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
15343
15756
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
15344
15757
  return true;
15345
15758
  #else
@@ -15356,10 +15769,6 @@ void llama_backend_init(void) {
15356
15769
  struct ggml_context * ctx = ggml_init(params);
15357
15770
  ggml_free(ctx);
15358
15771
  }
15359
-
15360
- #ifdef GGML_USE_MPI
15361
- ggml_mpi_backend_init();
15362
- #endif
15363
15772
  }
15364
15773
 
15365
15774
  void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15369,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
15369
15778
  }
15370
15779
 
15371
15780
  void llama_backend_free(void) {
15372
- #ifdef GGML_USE_MPI
15373
- ggml_mpi_backend_free();
15374
- #endif
15375
15781
  ggml_quantize_free();
15376
15782
  }
15377
15783
 
@@ -15402,7 +15808,17 @@ struct llama_model * llama_load_model_from_file(
15402
15808
  return true;
15403
15809
  };
15404
15810
  }
15405
-
15811
+ if (params.rpc_servers != nullptr) {
15812
+ // split the comma-separated list and store the servers in model->rpc_servers
15813
+ std::string servers(params.rpc_servers);
15814
+ size_t pos = 0;
15815
+ while ((pos = servers.find(",")) != std::string::npos) {
15816
+ std::string server = servers.substr(0, pos);
15817
+ model->rpc_servers.push_back(server);
15818
+ servers.erase(0, pos + 1);
15819
+ }
15820
+ model->rpc_servers.push_back(servers);
15821
+ }
15406
15822
  int status = llama_model_load(path_model, *model, params);
15407
15823
  GGML_ASSERT(status <= 0);
15408
15824
  if (status < 0) {
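params.rpc_servers is a single comma-separated string; the loop above splits it and stores each entry in model->rpc_servers, and each entry later gets its own RPC backend. A standalone sketch of the same split (ours; host/port values are made up):

#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::string servers = "192.168.1.10:50052,192.168.1.11:50052";
    std::vector<std::string> out;

    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    out.push_back(servers);   // last (or only) entry

    for (const auto & s : out) printf("rpc server: %s\n", s.c_str());
    return 0;
}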
@@ -15441,6 +15857,11 @@ struct llama_context * llama_new_context_with_model(
15441
15857
  return nullptr;
15442
15858
  }
15443
15859
 
15860
+ if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
15861
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15862
+ params.flash_attn = false;
15863
+ }
15864
+
15444
15865
  llama_context * ctx = new llama_context(*model);
15445
15866
 
15446
15867
  const auto & hparams = model->hparams;
@@ -15464,7 +15885,7 @@ struct llama_context * llama_new_context_with_model(
      cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
      // this is necessary due to kv_self.n being padded later during inference
-     cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+     cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
 
      // with causal attention, the batch size is limited by the context size
      cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15499,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
          cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
      }
 
+     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
      cparams.causal_attn = hparams.causal_attn;
 
      if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
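The hard-coded 256 is replaced by llama_kv_cache_get_padding(cparams), a helper defined earlier in this file and not shown in this hunk. As a rough sketch of the idea only (the concrete values here are assumptions, not taken from this diff), the padding depends on whether flash attention is enabled, since flash attention wants a coarser KV-cache alignment:

    #include <cstdint>

    // rough sketch with assumed values: choose the KV-cache size alignment
    static uint32_t sketch_kv_cache_padding(bool flash_attn) {
        return flash_attn ? 256u : 32u;
    }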
@@ -15509,16 +15931,6 @@ struct llama_context * llama_new_context_with_model(
          }
      }
 
-     if (cparams.flash_attn && hparams.use_alibi) {
-         LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-         cparams.flash_attn = false;
-     }
-
-     if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-         LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-         cparams.flash_attn = false;
-     }
-
      if (params.seed == LLAMA_DEFAULT_SEED) {
          params.seed = time(NULL);
      }
@@ -15554,7 +15966,17 @@ struct llama_context * llama_new_context_with_model(
 
      if (!hparams.vocab_only) {
          // initialize backends
- #ifdef GGML_USE_METAL
+ #if defined(GGML_USE_RPC)
+         for (auto & server : model->rpc_servers) {
+             ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+             if (backend == nullptr) {
+                 LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                 llama_free(ctx);
+                 return nullptr;
+             }
+             ctx->backends.push_back(backend);
+         }
+ #elif defined(GGML_USE_METAL)
          if (model->n_gpu_layers > 0) {
              ctx->backend_metal = ggml_backend_metal_init();
              if (ctx->backend_metal == nullptr) {
@@ -15710,7 +16132,11 @@ struct llama_context * llama_new_context_with_model(
          ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
          // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-         bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+         bool pipeline_parallel =
+             llama_get_device_count(*model) > 1 &&
+             model->n_gpu_layers > (int)model->hparams.n_layer &&
+             model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+             params.offload_kqv;
  #ifndef GGML_USE_CUDA
          // pipeline parallelism requires support for async compute and events
          // currently this is only implemented in the CUDA backend
@@ -15753,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
          }
      }
 
- #ifdef GGML_USE_MPI
-     ctx->ctx_mpi = ggml_mpi_init();
-
-     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-         // TODO: needs fix after #3228
-         GGML_ASSERT(false && "not implemented");
-         //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-         //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-         llama_backend_free();
-         exit(1);
-     }
- #endif
-
      return ctx;
  }
 
@@ -15803,11 +16215,11 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          // these models do not use RoPE
          case LLM_ARCH_GPT2:
          case LLM_ARCH_GPTJ:
-         case LLM_ARCH_GPTNEOX:
          case LLM_ARCH_MPT:
          case LLM_ARCH_REFACT:
          case LLM_ARCH_BLOOM:
          case LLM_ARCH_MAMBA:
+         case LLM_ARCH_JINA_BERT_V2:
              return LLAMA_ROPE_TYPE_NONE;
 
          // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -15822,13 +16234,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          case LLM_ARCH_XVERSE:
          case LLM_ARCH_COMMAND_R:
          case LLM_ARCH_OLMO:
+         case LLM_ARCH_ARCTIC:
              return LLAMA_ROPE_TYPE_NORM;
 
          // the pairs of head values are offset by n_rot/2
          case LLM_ARCH_FALCON:
          case LLM_ARCH_GROK:
          case LLM_ARCH_DBRX:
-         case LLM_ARCH_PERSIMMON:
          case LLM_ARCH_BERT:
          case LLM_ARCH_NOMIC_BERT:
          case LLM_ARCH_STABLELM:
@@ -15839,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          case LLM_ARCH_PHI3:
          case LLM_ARCH_GEMMA:
          case LLM_ARCH_STARCODER2:
+         case LLM_ARCH_GPTNEOX:
              return LLAMA_ROPE_TYPE_NEOX;
 
          // all model arches should be listed explicitly here
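A small usage sketch (not from this diff) of the public accessor these cases feed; after the reshuffle, GPT-NeoX models report NEOX-style RoPE instead of falling into the no-RoPE group:

    #include "llama.h"

    // sketch: branch on the RoPE flavour reported for a loaded model
    static bool uses_neox_rope(const llama_model * model) {
        return llama_rope_type(model) == LLAMA_ROPE_TYPE_NEOX; // pairs of head values offset by n_rot/2
    }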
@@ -15998,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
      }
 
      // make tensors
+     cvec.tensors.reserve(model.hparams.n_layer);
      cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
      for (size_t il = 1; il < model.hparams.n_layer; il++) {
          struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16006,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
      }
 
      // allocate tensors / buffers and zero
+     cvec.ctxs.reserve(ctx_map.size());
+     cvec.bufs.reserve(ctx_map.size());
      for (auto it : ctx_map) {
          ggml_backend_buffer_type_t buft = it.first;
          ggml_context * ctx = it.second;
@@ -16829,13 +17245,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
          }
          else {
              if (cell_range_begin != kv_self.size) {
-                 cell_ranges.push_back({ cell_range_begin, i });
+                 cell_ranges.emplace_back(cell_range_begin, i);
                  cell_range_begin = kv_self.size;
              }
          }
      }
      if (cell_range_begin != kv_self.size) {
-         cell_ranges.push_back({ cell_range_begin, kv_self.size });
+         cell_ranges.emplace_back(cell_range_begin, kv_self.size);
      }
 
      // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
@@ -17214,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
      ctx->cparams.n_threads_batch = n_threads_batch;
  }
 
+ uint32_t llama_n_threads(struct llama_context * ctx) {
+     return ctx->cparams.n_threads;
+ }
+
+ uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+     return ctx->cparams.n_threads_batch;
+ }
+
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
      ctx->abort_callback = abort_callback;
      ctx->abort_callback_data = abort_callback_data;
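A minimal usage sketch (not from this diff) for the two new getters, which simply read back the values configured through llama_set_n_threads:

    #include <cstdio>
    #include "llama.h"

    // sketch: inspect the thread counts currently set on a context
    static void print_thread_config(llama_context * ctx) {
        std::printf("gen threads: %u, batch threads: %u\n",
                    llama_n_threads(ctx), llama_n_threads_batch(ctx));
    }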
@@ -17648,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
              }
          }
          // llama2 templates seem to not care about "add_generation_prompt"
+     } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+         // Phi 3
+         for (auto message : chat) {
+             std::string role(message->role);
+             ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+         }
+         if (add_ass) {
+             ss << "<|assistant|>\n";
+         }
      } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
          // zephyr template
          for (auto message : chat) {
@@ -17780,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
          if (add_ass) {
              ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
          }
-     } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-         // Phi 3
-         for (auto message : chat) {
-             std::string role(message->role);
-             ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-         }
-         if (add_ass) {
-             ss << "<|assistant|>\n";
-         }
      } else {
          // template not supported
          return -1;
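Besides moving the Phi 3 branch ahead of the zephyr check (whose "<|user|>" probe would otherwise also match Phi 3 templates), the rewritten block no longer runs trim() over the message content. A hedged sketch (not from this diff; the messages are placeholders) of exercising it through the public API:

    #include <string>
    #include <vector>
    #include "llama.h"

    // sketch: render a short conversation with the built-in "phi3" template
    static std::string render_phi3(const llama_model * model) {
        std::vector<llama_chat_message> chat = {
            { "user",      "Hello"               },
            { "assistant", "Hi, how can I help?" },
        };
        std::vector<char> buf(1024);
        const int32_t n = llama_chat_apply_template(model, "phi3", chat.data(), chat.size(),
                                                    /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n <= 0 || n > (int32_t) buf.size()) {
            return std::string(); // template rejected or buffer too small
        }
        return std::string(buf.data(), n); // ends with "<|assistant|>\n" because add_ass is true
    }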
@@ -17910,6 +18334,7 @@ const char * llama_print_system_info(void) {
      s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
      s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
      s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
      s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
      s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
      s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
@@ -17970,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
      g_state.log_callback_user_data = user_data;
  #ifdef GGML_USE_METAL
      ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+     ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
  #endif
  }
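With this change, a callback installed through llama_log_set is also forwarded to the CUDA backend, not only Metal. A hedged sketch (not from this diff) of installing one:

    #include <cstdio>
    #include "llama.h"

    // sketch: route llama.cpp logs (including CUDA backend messages, per the change above) to stderr
    static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        std::fputs(text, stderr);
    }

    // installed somewhere during startup:
    // llama_log_set(my_log_callback, nullptr);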