llama_cpp 0.15.1 → 0.15.3

This diff compares the contents of two publicly released versions of the package, exactly as they appear in their respective public registries. It is provided for informational purposes only.
@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -22,16 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-#ifdef GGML_USE_MPI
-#  include "ggml-mpi.h"
-#endif
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -106,7 +103,7 @@
 #endif
 
 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 60
+#define LLAMA_MAX_EXPERTS 128
 
 //
 // logging
@@ -201,10 +198,10 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -224,43 +221,45 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_ARCTIC,
     LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama" },
-    { LLM_ARCH_FALCON, "falcon" },
-    { LLM_ARCH_GROK, "grok" },
-    { LLM_ARCH_GPT2, "gpt2" },
-    { LLM_ARCH_GPTJ, "gptj" },
-    { LLM_ARCH_GPTNEOX, "gptneox" },
-    { LLM_ARCH_MPT, "mpt" },
-    { LLM_ARCH_BAICHUAN, "baichuan" },
-    { LLM_ARCH_STARCODER, "starcoder" },
-    { LLM_ARCH_PERSIMMON, "persimmon" },
-    { LLM_ARCH_REFACT, "refact" },
-    { LLM_ARCH_BERT, "bert" },
-    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
-    { LLM_ARCH_BLOOM, "bloom" },
-    { LLM_ARCH_STABLELM, "stablelm" },
-    { LLM_ARCH_QWEN, "qwen" },
-    { LLM_ARCH_QWEN2, "qwen2" },
-    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
-    { LLM_ARCH_PHI2, "phi2" },
-    { LLM_ARCH_PHI3, "phi3" },
-    { LLM_ARCH_PLAMO, "plamo" },
-    { LLM_ARCH_CODESHELL, "codeshell" },
-    { LLM_ARCH_ORION, "orion" },
-    { LLM_ARCH_INTERNLM2, "internlm2" },
-    { LLM_ARCH_MINICPM, "minicpm" },
-    { LLM_ARCH_GEMMA, "gemma" },
-    { LLM_ARCH_STARCODER2, "starcoder2" },
-    { LLM_ARCH_MAMBA, "mamba" },
-    { LLM_ARCH_XVERSE, "xverse" },
-    { LLM_ARCH_COMMAND_R, "command-r" },
-    { LLM_ARCH_DBRX, "dbrx" },
-    { LLM_ARCH_OLMO, "olmo" },
-    { LLM_ARCH_UNKNOWN, "(unknown)" },
+    { LLM_ARCH_LLAMA,        "llama"        },
+    { LLM_ARCH_FALCON,       "falcon"       },
+    { LLM_ARCH_GROK,         "grok"         },
+    { LLM_ARCH_GPT2,         "gpt2"         },
+    { LLM_ARCH_GPTJ,         "gptj"         },
+    { LLM_ARCH_GPTNEOX,      "gptneox"      },
+    { LLM_ARCH_MPT,          "mpt"          },
+    { LLM_ARCH_BAICHUAN,     "baichuan"     },
+    { LLM_ARCH_STARCODER,    "starcoder"    },
+    { LLM_ARCH_REFACT,       "refact"       },
+    { LLM_ARCH_BERT,         "bert"         },
+    { LLM_ARCH_NOMIC_BERT,   "nomic-bert"   },
+    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+    { LLM_ARCH_BLOOM,        "bloom"        },
+    { LLM_ARCH_STABLELM,     "stablelm"     },
+    { LLM_ARCH_QWEN,         "qwen"         },
+    { LLM_ARCH_QWEN2,        "qwen2"        },
+    { LLM_ARCH_QWEN2MOE,     "qwen2moe"     },
+    { LLM_ARCH_PHI2,         "phi2"         },
+    { LLM_ARCH_PHI3,         "phi3"         },
+    { LLM_ARCH_PLAMO,        "plamo"        },
+    { LLM_ARCH_CODESHELL,    "codeshell"    },
+    { LLM_ARCH_ORION,        "orion"        },
+    { LLM_ARCH_INTERNLM2,    "internlm2"    },
+    { LLM_ARCH_MINICPM,      "minicpm"      },
+    { LLM_ARCH_GEMMA,        "gemma"        },
+    { LLM_ARCH_STARCODER2,   "starcoder2"   },
+    { LLM_ARCH_MAMBA,        "mamba"        },
+    { LLM_ARCH_XVERSE,       "xverse"       },
+    { LLM_ARCH_COMMAND_R,    "command-r"    },
+    { LLM_ARCH_DBRX,         "dbrx"         },
+    { LLM_ARCH_OLMO,         "olmo"         },
+    { LLM_ARCH_ARCTIC,       "arctic"       },
+    { LLM_ARCH_UNKNOWN,      "(unknown)"    },
 };
 
 enum llm_kv {
@@ -303,6 +302,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
@@ -380,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
     { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
 
@@ -435,6 +436,8 @@ enum llm_tensor {
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ROPE_FACTORS_LONG,
+    LLM_TENSOR_ROPE_FACTORS_SHORT,
     LLM_TENSOR_ATTN_Q,
     LLM_TENSOR_ATTN_K,
     LLM_TENSOR_ATTN_V,
@@ -454,6 +457,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_NORM_EXPS,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
@@ -592,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_PERSIMMON,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
-            { LLM_TENSOR_OUTPUT, "output"},
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
-            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
-            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
-            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
-        },
-    },
     {
         LLM_ARCH_MPT,
         {
@@ -691,6 +678,25 @@
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -800,18 +806,20 @@
     {
         LLM_ARCH_PHI3,
         {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
         },
     },
     {
@@ -1027,6 +1035,28 @@
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ARCTIC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1664,91 +1694,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     GGML_UNUSED(host_buffer);
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1757,6 +1702,8 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }
 
@@ -1770,17 +1717,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_14M,
     MODEL_17M,
     MODEL_22M,
     MODEL_33M,
+    MODEL_70M,
     MODEL_109M,
     MODEL_137M,
+    MODEL_160M,
     MODEL_335M,
+    MODEL_410M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_4B,
     MODEL_2B,
+    MODEL_2_8B,
     MODEL_3B,
     MODEL_4B,
+    MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
     MODEL_12B,
@@ -1803,6 +1757,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_10B_128x3_66B,
 };
 
 static const size_t kiB = 1024;
@@ -1812,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
+    bool use_par_res;
 
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
@@ -1830,6 +1786,7 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
+    float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
@@ -1845,7 +1802,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
+    bool use_alibi = false;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1878,6 +1835,7 @@ struct llama_hparams {
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
@@ -1975,6 +1933,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm_b;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
+    struct ggml_tensor * ffn_norm_exps;
 
     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -2012,6 +1971,10 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    // long rope factors
+    struct ggml_tensor * rope_long = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
 };
 
 struct llama_kv_cell {
@@ -2189,6 +2152,8 @@ struct llama_model {
     int main_gpu;
     int n_gpu_layers;
 
+    std::vector<std::string> rpc_servers;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
@@ -2317,7 +2282,6 @@ struct llama_context {
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos;  // F32 [n_kv]
     struct ggml_tensor * inp_K_shift; // I32 [kv_size]
     struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;     // I32 [n_batch]
@@ -2327,11 +2291,105 @@ struct llama_context {
 
     // control vectors
     struct llama_control_vector cvec;
+};
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+    buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(true);
+    }
+    return buft;
+    GGML_UNUSED(model);
+    GGML_UNUSED(gpu);
+}
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+    if (ggml_backend_cuda_get_device_count() > 1) {
+        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    }
 #endif
-};
+
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_offload(model, fallback_gpu);
+    }
+    return buft;
+
+    GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    return free;
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+    GGML_UNUSED(device);
+}
 
 //
 // kv cache helpers
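
Note on the relocated helpers above: they moved below struct llama_context and now take the model as a parameter so that, when GGML_USE_RPC is defined, each endpoint in model.rpc_servers counts as one "device" whose buffer type and free memory are queried over the network. A minimal C++ sketch of that convention (the endpoint values and the populate step are hypothetical; only the accessors appear in this diff):

    llama_model model;
    model.rpc_servers = { "192.168.1.10:50052", "192.168.1.11:50052" }; // hypothetical endpoints

    // with GGML_USE_RPC, one "device" per RPC server:
    size_t n_dev = llama_get_device_count(model); // == 2 here
    for (int i = 0; i < (int) n_dev; ++i) {
        size_t free_bytes = llama_get_device_memory(model, i); // free memory reported by that server
        ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(model, i);
        (void) free_bytes; (void) buft;
    }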
@@ -2452,7 +2510,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
-    const uint32_t n_ctx = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
     if (cache.recurrent) {
@@ -2503,16 +2560,16 @@
     }
     // otherwise, one cell per token.
 
-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
         return false;
     }
 
     uint32_t n_tested = 0;
 
     while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
             cache.head = 0;
             continue;
         }
@@ -2531,7 +2588,7 @@
             break;
         }
 
-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
@@ -2785,6 +2842,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
     cache.do_defrag = true;
 }
 
+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
 //
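
The new llama_kv_cache_get_padding() above centralizes the cell-count alignment the attention kernels expect: 256 with flash attention enabled, 32 otherwise. A caller would round the requested context size up to that granularity; a sketch of the rounding (this arithmetic is illustrative and assumes cparams and n_ctx are in scope, it is not shown in this hunk):

    uint32_t pad     = llama_kv_cache_get_padding(cparams); // 256u with flash_attn, else 32u
    uint32_t kv_size = ((n_ctx + pad - 1) / pad) * pad;     // e.g. n_ctx = 520 -> 544 (pad 32) or 768 (pad 256)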
@@ -3287,22 +3349,55 @@ struct llama_model_loader {
     }
 
     template<typename T>
-    bool get_key(const std::string & key, T & result, const bool required = true) {
-        auto it = kv_overrides.find(key);
+    bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+        const int kid = gguf_find_key(meta, key.c_str());
 
-        const struct llama_model_kv_override * override =
-            it != kv_overrides.end() ? &it->second : nullptr;
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
 
-        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
 
-        if (required && !found) {
-            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
         }
 
-        return found;
-    }
+        // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
 
-    template<typename T>
+        result.resize(arr_info.length);
+        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+        return true;
+    }
+
+    template<typename T>
+    bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+        return get_arr(llm_kv(kid), result, required);
+    }
+
+    template<typename T>
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
+
+        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+
+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
     bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
         return get_key(llm_kv(kid), result, required);
     }
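
The loader gains a typed get_arr() alongside get_key(): it verifies the GGUF value is a float32 or int32 array, asserts that T matches the element type, and copies the data out. A hedged usage sketch (the key name below is hypothetical, chosen only to show the shape of a call):

    std::vector<float> factors;
    if (ml.get_arr("example.rope.scaling.factors", factors, /*required =*/ false)) {
        // factors now holds the float32 array; an int32 array would need std::vector<int>
    }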
@@ -3360,11 +3455,15 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         ggml_set_name(tensor, ggml_get_name(cur));
 
-        n_created++;
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }
 
         return tensor;
     }
@@ -3399,14 +3498,17 @@ struct llama_model_loader {
         return cur;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
-        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+    static const int TENSOR_NOT_REQUIRED = 1;
+    static const int TENSOR_DUPLICATED   = 2;
+
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
         if (cur == NULL) {
             return NULL;
         }
 
-        return create_tensor_for(ctx, cur);
+        return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
 
     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
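
create_tensor()'s boolean `required` parameter becomes a small bit mask: TENSOR_NOT_REQUIRED reproduces required = false, while TENSOR_DUPLICATED replaces the manual "ml.n_created--; ml.size_data += ..." bookkeeping for tensors that alias another tensor's data. The pattern, as it appears at call sites later in this diff:

    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"),
                                    {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
    if (model.output == NULL) {
        // tied embeddings: load token_embd again, accounted for as duplicated data
        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),
                                        {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
    }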
@@ -3706,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_22M: return "22M";
-        case MODEL_33M: return "33M";
-        case MODEL_109M: return "109M";
-        case MODEL_137M: return "137M";
-        case MODEL_0_5B: return "0.5B";
-        case MODEL_1B: return "1B";
-        case MODEL_2B: return "2B";
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
-        case MODEL_8B: return "8B";
-        case MODEL_12B: return "12B";
-        case MODEL_13B: return "13B";
-        case MODEL_14B: return "14B";
-        case MODEL_15B: return "15B";
-        case MODEL_20B: return "20B";
-        case MODEL_30B: return "30B";
-        case MODEL_34B: return "34B";
-        case MODEL_35B: return "35B";
-        case MODEL_40B: return "40B";
-        case MODEL_65B: return "65B";
-        case MODEL_70B: return "70B";
-        case MODEL_314B: return "314B";
-        case MODEL_SMALL: return "0.1B";
-        case MODEL_MEDIUM: return "0.4B";
-        case MODEL_LARGE: return "0.8B";
-        case MODEL_XL: return "1.5B";
-        case MODEL_A2_7B: return "A2.7B";
-        case MODEL_8x7B: return "8x7B";
-        case MODEL_8x22B: return "8x22B";
-        case MODEL_16x12B: return "16x12B";
-        default: return "?B";
+        case MODEL_14M:           return "14M";
+        case MODEL_17M:           return "17M";
+        case MODEL_22M:           return "22M";
+        case MODEL_33M:           return "33M";
+        case MODEL_70M:           return "70M";
+        case MODEL_109M:          return "109M";
+        case MODEL_137M:          return "137M";
+        case MODEL_160M:          return "160M";
+        case MODEL_335M:          return "335M";
+        case MODEL_410M:          return "410M";
+        case MODEL_0_5B:          return "0.5B";
+        case MODEL_1B:            return "1B";
+        case MODEL_1_4B:          return "1.4B";
+        case MODEL_2B:            return "2B";
+        case MODEL_2_8B:          return "2.8B";
+        case MODEL_3B:            return "3B";
+        case MODEL_4B:            return "4B";
+        case MODEL_6_9B:          return "6.9B";
+        case MODEL_7B:            return "7B";
+        case MODEL_8B:            return "8B";
+        case MODEL_12B:           return "12B";
+        case MODEL_13B:           return "13B";
+        case MODEL_14B:           return "14B";
+        case MODEL_15B:           return "15B";
+        case MODEL_20B:           return "20B";
+        case MODEL_30B:           return "30B";
+        case MODEL_34B:           return "34B";
+        case MODEL_35B:           return "35B";
+        case MODEL_40B:           return "40B";
+        case MODEL_65B:           return "65B";
+        case MODEL_70B:           return "70B";
+        case MODEL_314B:          return "314B";
+        case MODEL_SMALL:         return "0.1B";
+        case MODEL_MEDIUM:        return "0.4B";
+        case MODEL_LARGE:         return "0.8B";
+        case MODEL_XL:            return "1.5B";
+        case MODEL_A2_7B:         return "A2.7B";
+        case MODEL_8x7B:          return "8x7B";
+        case MODEL_8x22B:         return "8x22B";
+        case MODEL_16x12B:        return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        default:                  return "?B";
     }
 }
 
@@ -3779,6 +3892,12 @@ static void llm_load_hparams(
 
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
     ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3823,6 +3942,8 @@
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
     // sanity check for n_rot (optional)
     {
         hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3860,7 +3981,7 @@
                switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -3922,14 +4043,6 @@
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 36: model.type = e_model::MODEL_8B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_REFACT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3962,6 +4075,19 @@
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4:  model.type = e_model::MODEL_33M;  break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4058,6 +4184,7 @@
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4198,6 +4325,65 @@
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff) {
+                            case 512: model.type = e_model::MODEL_14M; break;
+                            case 2048: model.type = e_model::MODEL_70M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff) {
+                            case 3072: model.type = e_model::MODEL_160M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff) {
+                            case 8192: model.type = e_model::MODEL_1B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff) {
+                            case 4096: model.type = e_model::MODEL_410M; break;
+                            case 8192: model.type = e_model::MODEL_1_4B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff) {
+                            case 10240: model.type = e_model::MODEL_2_8B; break;
+                            case 16384: model.type = e_model::MODEL_6_9B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff) {
+                            case 20480: model.type = e_model::MODEL_12B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff) {
+                            case 24576: model.type = e_model::MODEL_20B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
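The new GPT-NeoX case above reads use_parallel_residual into hparams.use_par_res and infers the Pythia-family size from (n_layer, n_ff), e.g. n_layer == 16 with n_ff == 8192 maps to the 1B variant. Schematically, the flag selects between the two residual layouts at graph-build time (pseudocode, not code from this diff):

    // parallel residual (GPT-NeoX default, use_par_res == true):
    //     cur = x + attn(norm_1(x)) + ffn(norm_2(x));
    // sequential residual (use_par_res == false):
    //     cur = x + attn(norm_1(x));
    //     cur = cur + ffn(norm_2(cur));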
@@ -4383,7 +4569,11 @@ static void llm_load_vocab(
                 tokenizer_pre == "starcoder") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
         } else if (
-                tokenizer_pre == "gpt-2") {
+                tokenizer_pre == "gpt-2" ||
+                tokenizer_pre == "jina-es" ||
+                tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "jina-v2-es" ||
+                tokenizer_pre == "jina-v2-de") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -4394,6 +4584,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "qwen2") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+                tokenizer_pre == "stablelm2") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
         } else if (
                 tokenizer_pre == "olmo") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -4515,7 +4708,8 @@ static void llm_load_vocab(
                     (t.first == "<|eot_id|>" ||
                      t.first == "<|im_end|>" ||
                      t.first == "<|end|>" ||
-                     t.first == "<end_of_turn>"
+                     t.first == "<end_of_turn>" ||
+                     t.first == "<|endoftext|>"
                     )
                ) {
                 vocab.special_eot_id = t.second;
@@ -4743,13 +4937,13 @@ static bool llm_load_tensors(
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
-        int device_count = llama_get_device_count();
+        int device_count = llama_get_device_count(model);
        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
+                splits[i] = llama_get_device_memory(model, i);
             }
         } else {
             std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4963,35 @@ static bool llm_load_tensors(
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, main_gpu);
         }
         // assign the repeating layers
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             model.buft_layer[i] = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -4841,6 +5035,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
+        const int64_t n_embd_head = n_embd / hparams.n_head;
        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4875,12 +5070,10 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         // if output is NULL, init from the input tok embed
                         if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                         }
                     }
                 }
@@ -4899,10 +5092,10 @@
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
                     // optional bias tensors
-                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
-                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
-                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
@@ -4913,7 +5106,7 @@
                     } else {
                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         if (layer.ffn_gate_exps) {
                             layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
                             layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -4955,12 +5148,10 @@
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
 
@@ -4983,7 +5174,7 @@
 
                     layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (layer.ffn_gate_exps) {
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
                         layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5085,11 +5276,9 @@
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
 
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (!model.output) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                     }
                 }
 
@@ -5102,8 +5291,8 @@
                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
 
-                    layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
-                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
+                    layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5121,7 +5310,12 @@
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (!model.output) {
+                        // needs to be on GPU
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
@@ -5149,47 +5343,6 @@ static bool llm_load_tensors(
5149
5343
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5150
5344
  }
5151
5345
  } break;
5152
- case LLM_ARCH_PERSIMMON:
5153
- {
5154
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5155
-
5156
- {
5157
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5158
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5159
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5160
- }
5161
-
5162
- for (int i = 0; i < n_layer; ++i) {
5163
- ggml_context * ctx_layer = ctx_for_layer(i);
5164
- ggml_context * ctx_split = ctx_for_layer_split(i);
5165
-
5166
- auto & layer = model.layers[i];
5167
-
5168
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5169
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5170
-
5171
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5172
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
5173
-
5174
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5175
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
5176
-
5177
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5178
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5179
-
5180
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5181
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5182
-
5183
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5184
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
5185
-
5186
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
5187
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
5188
-
5189
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
5190
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
5191
- }
5192
- } break;
5193
5346
  case LLM_ARCH_BERT:
5194
5347
  case LLM_ARCH_NOMIC_BERT:
5195
5348
  {
@@ -5242,6 +5395,50 @@ static bool llm_load_tensors(
5242
5395
  layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5243
5396
  }
5244
5397
  } break;
5398
+ case LLM_ARCH_JINA_BERT_V2:
5399
+ {
5400
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
5401
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
5402
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
5403
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
5404
+
5405
+ for (int i = 0; i < n_layer; ++i) {
5406
+ ggml_context * ctx_layer = ctx_for_layer(i);
5407
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5408
+
5409
+ auto & layer = model.layers[i]; // JinaBertLayer
5410
+
5411
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5412
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5413
+
5414
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5415
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5416
+
5417
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5418
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5419
+
5420
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5421
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5422
+
5423
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5424
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5425
+
5426
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
5427
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
5428
+
5429
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
5430
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
5431
+
5432
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5433
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5434
+
5435
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5436
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5437
+
5438
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
5439
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5440
+ }
5441
+ } break;
  case LLM_ARCH_BLOOM:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5283,18 +5480,16 @@ static bool llm_load_tensors(
  case LLM_ARCH_MPT:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5305,31 +5500,31 @@ static bool llm_load_tensors(
  auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // AWQ ScaleActivation layer
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  }
  } break;
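
Editor's note: the hunks above migrate create_tensor() from a trailing `bool required` to named flags. A minimal self-contained sketch of the pattern, assuming the flag semantics implied by the call sites (toy_loader and find() are hypothetical, not llama.cpp API):

#include <stdexcept>
#include <string>

struct toy_loader {
    enum : int {
        TENSOR_NOT_REQUIRED = 1 << 0, // a missing tensor yields nullptr instead of an error
        TENSOR_DUPLICATED   = 1 << 1, // an alias of an already-counted tensor (e.g. tok_embd reused as output)
    };

    int n_created = 0;

    void * find(const std::string &) { return nullptr; } // stub lookup

    void * create_tensor(const std::string & name, int flags = 0) {
        void * t = find(name);
        if (!t && !(flags & TENSOR_NOT_REQUIRED)) {
            throw std::runtime_error("missing tensor: " + name);
        }
        if (t && !(flags & TENSOR_DUPLICATED)) {
            n_created++; // duplicated tensors are not counted twice
        }
        return t;
    }
};

This replaces the old hand-rolled bookkeeping (`ml.n_created--; // artificial tensor`) visible in the deleted lines.
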
  case LLM_ARCH_STABLELM:
@@ -5358,17 +5553,17 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors, present in Stable LM 2 1.6B
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional q and k layernorms, present in StableLM 2 12B
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5411,12 +5606,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5514,8 +5707,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  if (layer.wqkv == nullptr) {
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5552,17 +5745,20 @@ static bool llm_load_tensors(
  ggml_context* ctx_layer = ctx_for_layer(i);
  ggml_context* ctx_split = ctx_for_layer_split(i);

- auto& layer = model.layers[i];
+ auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  }
  } break;
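
Editor's note: the rope_long/rope_short lines above combine TENSOR_NOT_REQUIRED with TENSOR_DUPLICATED for every layer after the first because the GGUF file stores a single long-factor and a single short-factor vector that all layers share. A small sketch of the selection logic that consumes them (names are illustrative; the real lookup is build_rope_factors() later in this diff):

#include <cstdint>

struct layer_ref { const float * rope_long; const float * rope_short; };

static const float * pick_rope_factors(const layer_ref & l,
                                       uint32_t n_ctx_per_seq,
                                       uint32_t n_orig_train_ctx) {
    // beyond the original training context, switch to the long factors
    return n_ctx_per_seq > n_orig_train_ctx ? l.rope_long : l.rope_short;
}
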
  case LLM_ARCH_PLAMO:
@@ -5731,9 +5927,7 @@ static bool llm_load_tensors(

  // output
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

  const int64_t n_ff = hparams.n_ff;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5768,12 +5962,10 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  }
@@ -5824,12 +6016,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5890,9 +6080,7 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  // init output from the input tok embed
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5924,12 +6112,10 @@ static bool llm_load_tensors(

  // output
  {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5949,6 +6135,81 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ }
+ } break;
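
Editor's note: the ARCTIC *_exps tensors above are 3-D, stacking one FFN matrix per expert along the third dimension. A sketch of slicing out a single expert, assuming that stacking convention (the helper itself is not part of this diff):

#include "ggml.h"

// Returns a 2-D view of expert e inside a {ne0, ne1, n_expert} stack.
static struct ggml_tensor * expert_slice(struct ggml_context * ctx,
                                         struct ggml_tensor * exps, int e) {
    return ggml_view_2d(ctx, exps,
                        exps->ne[0], exps->ne[1],
                        exps->nb[1],
                        e * exps->nb[2]); // byte offset of expert e
}
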
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -6213,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(

  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
  } else {
- #ifdef GGML_USE_MPI
- GGML_ASSERT(false && "not implemented");
- #endif
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
  inpL = lctx.inp_embd;
  ggml_set_input(lctx.inp_embd);
  }
@@ -6318,7 +6576,7 @@ static struct ggml_tensor * llm_build_ffn(
  llm_ffn_gate_type type_gate,
  const llm_build_cb & cb,
  int il) {
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+ struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
  cb(tmp, "ffn_up", il);

  if (up_b) {
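
Editor's note: with `up ? ggml_mul_mat(ctx, up, cur) : cur`, llm_build_ffn now tolerates a null up projection, and the existing gate-type argument distinguishes sequential from parallel gating. A scalar reference of the two orderings as I read the builder (illustrative, not the graph code):

enum ffn_gate_type { FFN_SEQ, FFN_PAR };

// f_up/f_gate/f_down stand in for the projections; act is the activation.
template <typename U, typename G, typename D, typename A>
float ffn_ref(float x, U f_up, G f_gate, D f_down, A act, ffn_gate_type t) {
    const float up = f_up(x);
    // FFN_PAR gates the layer input in parallel; FFN_SEQ gates the up output.
    const float g = (t == FFN_PAR) ? act(f_gate(x)) * up : act(f_gate(up));
    return f_down(g);
}
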
@@ -6500,7 +6758,6 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * wo_b,
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
- struct ggml_tensor * kq_pos,
  int32_t n_tokens,
  int32_t n_kv,
  float kq_scale,
@@ -6512,6 +6769,7 @@ static struct ggml_tensor * llm_build_kqv(
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
  cb(q, "q", il);
@@ -6530,31 +6788,27 @@ static struct ggml_tensor * llm_build_kqv(
  GGML_UNUSED(model);
  GGML_UNUSED(n_ctx);

- // note: if this assert triggers, then some check has failed earlier
- // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
- GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
  // split cached v into n_head heads (not transposed)
  struct ggml_tensor * v =
  ggml_view_3d(ctx, kv.v_l[il],
  n_embd_head_v, n_kv, n_head_kv,
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
  0);
  cb(v, "v", il);

- cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);

- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
  }

- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
  } else {
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6574,28 +6828,8 @@ static struct ggml_tensor * llm_build_kqv(
  kq = ggml_scale(ctx, kq, 30);
  }

- #if defined(GGML_USE_KOMPUTE)
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
- #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
- if (hparams.use_alibi) {
- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
-
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
-
- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
-
- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
- } else
- #endif
- {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
- }
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);

  GGML_ASSERT(kv.size == n_ctx);
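
Editor's note: with ggml_alibi() and the kq_pos input gone, the whole scale-bias-mask-softmax sequence is fused into ggml_soft_max_ext(kq, mask, scale, max_bias). A scalar reference for one attention row, under my reading of the fused semantics (the authoritative kernel lives in ggml):

#include <algorithm>
#include <cmath>
#include <vector>

// slope is the per-head ALiBi slope; it degenerates to 1 when max_bias == 0.
std::vector<float> soft_max_ext_row(const std::vector<float> & kq,
                                    const std::vector<float> & mask,
                                    float scale, float slope) {
    std::vector<float> out(kq.size());
    float vmax = -INFINITY;
    for (size_t i = 0; i < kq.size(); ++i) {
        out[i] = kq[i]*scale + slope*mask[i]; // -INFINITY in mask disables a slot
        vmax = std::max(vmax, out[i]);
    }
    float sum = 0.0f;
    for (float & v : out) { v = expf(v - vmax); sum += v; }
    for (float & v : out) { v /= sum; }
    return out;
}
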
@@ -6614,7 +6848,7 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
  cb(kqv_merged, "kqv_merged", il);

- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
  cb(cur, "kqv_merged_cont", il);
  }

@@ -6645,7 +6879,6 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_tensor * v_cur,
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
- struct ggml_tensor * kq_pos,
  int32_t n_tokens,
  int32_t kv_head,
  int32_t n_kv,
@@ -6664,7 +6897,7 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_tensor * cur;

  cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
- q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);

  return cur;
@@ -6771,18 +7004,17 @@ struct llm_build_context {

  ctx0 = ggml_init(params);

- lctx.inp_tokens = nullptr;
- lctx.inp_embd = nullptr;
- lctx.inp_pos = nullptr;
+ lctx.inp_tokens = nullptr;
+ lctx.inp_embd = nullptr;
+ lctx.inp_pos = nullptr;
  lctx.inp_out_ids = nullptr;
  lctx.inp_KQ_mask = nullptr;
- lctx.inp_KQ_pos = nullptr;
  lctx.inp_K_shift = nullptr;
- lctx.inp_mean = nullptr;
- lctx.inp_cls = nullptr;
- lctx.inp_s_copy = nullptr;
- lctx.inp_s_mask = nullptr;
- lctx.inp_s_seq = nullptr;
+ lctx.inp_mean = nullptr;
+ lctx.inp_cls = nullptr;
+ lctx.inp_s_copy = nullptr;
+ lctx.inp_s_mask = nullptr;
+ lctx.inp_s_seq = nullptr;
  }

  void free() {
@@ -6801,17 +7033,20 @@ struct llm_build_context {
  cb(lctx.inp_K_shift, "K_shift", -1);
  ggml_set_input(lctx.inp_K_shift);

+
  for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
  struct ggml_tensor * tmp =
  // we rotate only the first n_rot dimensions
- ggml_rope_custom_inplace(ctx0,
+ ggml_rope_ext_inplace(ctx0,
  ggml_view_3d(ctx0, kv_self.k_l[il],
  n_embd_head_k, n_head_kv, n_ctx,
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  0),
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
+
  cb(tmp, "K_shifted", il);
  ggml_build_forward_expand(gf, tmp);
  }
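
Editor's note: the loop above re-rotates cached K vectors after a KV-cache shift, now threading the per-model rope_factors through the same rope call. Re-rotation works because RoPE rotations compose additively in the position argument. A scalar check of that identity (my illustration, not diff content):

#include <cmath>

// Rotating an already-rotated 2-D pair by `delta` lands on the rotation for
// (pos + delta), which is exactly what the K-shift needs.
void rerotate_pair(float & x0, float & x1, int delta, float theta) {
    const float c = cosf(theta * delta), s = sinf(theta * delta);
    const float y0 = x0*c - x1*s;
    const float y1 = x0*s + x1*c;
    x0 = y0; x1 = y1;
}
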
@@ -6914,6 +7149,17 @@ struct llm_build_context {
  return lctx.inp_pos;
  }

+ struct ggml_tensor * build_rope_factors(int il) {
+ // choose long/short freq factors based on the context size
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+ return model.layers[il].rope_long;
+ }
+
+ return model.layers[il].rope_short;
+ }
+
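
Editor's note: a worked example of the helper just added, with made-up numbers. With cparams.n_ctx = 32768 and cparams.n_seq_max = 2, each sequence sees 16384 tokens; if hparams.n_yarn_orig_ctx = 4096, the long factors are chosen:

#include <cstdint>

static bool use_long_rope_factors(uint32_t n_ctx, uint32_t n_seq_max,
                                  uint32_t n_yarn_orig_ctx) {
    const uint32_t n_ctx_per_seq = n_ctx / n_seq_max; // mirrors build_rope_factors()
    return n_ctx_per_seq > n_yarn_orig_ctx;           // 16384 > 4096 -> long factors
}
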
  struct ggml_tensor * build_inp_out_ids() {
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -6932,19 +7178,6 @@ struct llm_build_context {
  return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
  }

- struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
- if (causal) {
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
- } else {
- // TODO: this will be needed for ALiBi-based BERT models
- // https://github.com/ggerganov/llama.cpp/pull/6826
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
- }
- cb(lctx.inp_KQ_pos, "KQ_pos", -1);
- ggml_set_input(lctx.inp_KQ_pos);
- return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
- }
-
  struct ggml_tensor * build_inp_mean() {
  lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
  cb(lctx.inp_mean, "inp_mean", -1);
@@ -7034,15 +7267,15 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -7050,7 +7283,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
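
Editor's note: the ggml_rope_custom to ggml_rope_ext migration repeated through the rest of this diff adds exactly one argument, an optional frequency-factor tensor after the positions; nullptr reproduces the old behaviour. A hypothetical compatibility shim making the mapping explicit (not part of the package):

static struct ggml_tensor * rope_compat(
        struct ggml_context * ctx, struct ggml_tensor * x,
        struct ggml_tensor * pos, struct ggml_tensor * freq_factors,
        int n_rot, int rope_type, int n_orig_ctx,
        float freq_base, float freq_scale, float ext_factor,
        float attn_factor, float beta_fast, float beta_slow) {
    // old call: ggml_rope_custom(ctx, x, pos, n_rot, rope_type, 0, n_orig_ctx, ...)
    // new call: identical, plus freq_factors (nullptr == unchanged frequencies)
    return ggml_rope_ext(ctx, x, pos, freq_factors, n_rot, rope_type, 0,
                         n_orig_ctx, freq_base, freq_scale, ext_factor,
                         attn_factor, beta_fast, beta_slow);
}
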
@@ -7143,9 +7376,6 @@ struct llm_build_context {
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
  for (int il = 0; il < n_layer; ++il) {
  struct ggml_tensor * inpSA = inpL;

@@ -7167,13 +7397,13 @@ struct llm_build_context {

  switch (model.type) {
  case MODEL_7B:
- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -7190,7 +7420,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7260,9 +7490,6 @@ struct llm_build_context {
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
  for (int il = 0; il < n_layer; ++il) {
  struct ggml_tensor * inpSA = inpL;

@@ -7282,22 +7509,22 @@ struct llm_build_context {
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7403,21 +7630,21 @@ struct llm_build_context {
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
- Qcur = ggml_rope_custom(
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7526,15 +7753,15 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -7542,7 +7769,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -7678,15 +7905,15 @@ struct llm_build_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -7694,7 +7921,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7806,7 +8033,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7855,266 +8082,56 @@ struct llm_build_context {
  return gf;
  }

- struct ggml_cgraph * build_persimmon() {
+ struct ggml_cgraph * build_refact() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

- // inp_pos - contains the positions
- struct ggml_tensor * inp_pos = build_inp_pos();
-
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

  for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * residual = inpL;
+ struct ggml_tensor * inpSA = inpL;

  cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, cb, il);
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
  cb(cur, "attn_norm", il);

- // self attention
+ // self-attention
  {
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);

- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);

- // split qkv
- GGML_ASSERT(n_head_kv == n_head);
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);

- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
- cb(tmpqkv, "tmpqkv", il);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ cb(Kcur, "Kcur", il);

- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
- cb(tmpqkv_perm, "tmpqkv", il);
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ cb(Qcur, "Qcur", il);

- struct ggml_tensor * tmpq = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- 0
- );
- cb(tmpq, "tmpq", il);
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }

- struct ggml_tensor * tmpk = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
- );
- cb(tmpk, "tmpk", il);
-
- // Q/K Layernorm
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, cb, il);
- cb(tmpq, "tmpq", il);
-
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, cb, il);
- cb(tmpk, "tmpk", il);
-
- // RoPE the first n_rot of q/k, pass the other half, and concat.
- struct ggml_tensor * qrot = ggml_view_3d(
- ctx0, tmpq, n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- 0
- );
- cb(qrot, "qrot", il);
-
- struct ggml_tensor * krot = ggml_view_3d(
- ctx0, tmpk, n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head,
- 0
- );
- cb(krot, "krot", il);
-
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
- struct ggml_tensor * qpass = ggml_view_3d(
- ctx0, tmpq, n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- ggml_element_size(tmpq) * n_rot
- );
- cb(qpass, "qpass", il);
-
- struct ggml_tensor * kpass = ggml_view_3d(
- ctx0, tmpk, n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head,
- ggml_element_size(tmpk) * n_rot
- );
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * qrotated = ggml_rope_custom(
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(qrotated, "qrotated", il);
-
- struct ggml_tensor * krotated = ggml_rope_custom(
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(krotated, "krotated", il);
-
- // ggml currently only supports concatenation on dim=2
- // so we need to permute qrot, qpass, concat, then permute back.
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
- cb(qrotated, "qrotated", il);
-
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
- cb(krotated, "krotated", il);
-
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
- cb(qpass, "qpass", il);
-
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
- cb(Qcur, "Qcur", il);
-
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
- cb(Q, "Q", il);
-
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Vcur = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
- );
- cb(Vcur, "Vcur", il);
-
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
- }
-
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
- }
-
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, cb, il);
- cb(cur, "ffn_norm", il);
-
- cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "l_out", il);
-
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, cb, -1);
- cb(cur, "result_norm", -1);
-
- cur = ggml_mul_mat(ctx0, model.output, cur);
- cb(cur, "result_output", -1);
-
- ggml_build_forward_expand(gf, cur);
-
- return gf;
- }
-
- struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
- struct ggml_tensor * cur;
- struct ggml_tensor * inpL;
-
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
-
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm", il);
-
- // self-attention
- {
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
-
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
-
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- cb(Kcur, "Kcur", il);
-
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- cb(Qcur, "Qcur", il);
-
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
- }
-
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }

  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  cb(ffn_inp, "ffn_inp", il);
@@ -8168,8 +8185,11 @@ struct llm_build_context {

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
+ struct ggml_tensor * inp_pos = nullptr;

- struct ggml_tensor * inp_pos = build_inp_pos();
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+ inp_pos = build_inp_pos();
+ }
  struct ggml_tensor * inp_mean = build_inp_mean();
  struct ggml_tensor * inp_cls = build_inp_cls();
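
Editor's note: jina-bert-v2 takes the ALiBi path, so it is the one builder here that never creates inp_pos; positions enter only through per-head slope biases inside the fused softmax. The standard slope schedule as I understand it, for the power-of-two head-count case (an assumption, not code from this diff):

#include <cmath>

static float alibi_slope(int head, int n_head, float max_bias) {
    return powf(2.0f, -max_bias * (head + 1) / (float) n_head);
}
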
@@ -8200,13 +8220,26 @@ struct llm_build_context {
8200
8220
  struct ggml_tensor * Vcur;
8201
8221
 
8202
8222
  // self-attention
8203
- if (model.arch == LLM_ARCH_BERT) {
8223
+ if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
8204
8224
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
8205
8225
  cb(Qcur, "Qcur", il);
8206
8226
 
8227
+ if (model.layers[il].attn_q_norm) {
8228
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8229
+ model.layers[il].attn_q_norm,
8230
+ model.layers[il].attn_q_norm_b,
8231
+ LLM_NORM, cb, il);
8232
+ }
8233
+
8207
8234
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
8208
8235
  cb(Kcur, "Kcur", il);
8209
8236
 
8237
+ if (model.layers[il].attn_k_norm) {
8238
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8239
+ model.layers[il].attn_k_norm,
8240
+ model.layers[il].attn_k_norm_b,
8241
+ LLM_NORM, cb, il);
8242
+ }
8210
8243
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
8211
8244
  cb(Vcur, "Vcur", il);
8212
8245
 
@@ -8225,15 +8258,15 @@ struct llm_build_context {
8225
8258
  cb(Kcur, "Kcur", il);
8226
8259
  cb(Vcur, "Vcur", il);
8227
8260
 
8228
- Qcur = ggml_rope_custom(
8229
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8261
+ Qcur = ggml_rope_ext(
8262
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8230
8263
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8231
8264
  ext_factor, attn_factor, beta_fast, beta_slow
8232
8265
  );
8233
8266
  cb(Qcur, "Qcur", il);
8234
8267
 
8235
- Kcur = ggml_rope_custom(
8236
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8268
+ Kcur = ggml_rope_ext(
8269
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8237
8270
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8238
8271
  ext_factor, attn_factor, beta_fast, beta_slow
8239
8272
  );
@@ -8246,7 +8279,7 @@ struct llm_build_context {
8246
8279
  struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
8247
8280
  cb(kq, "kq", il);
8248
8281
 
8249
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8282
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8250
8283
  cb(kq, "kq_soft_max_ext", il);
8251
8284
 
8252
8285
  struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8297,6 +8330,13 @@ struct llm_build_context {
8297
8330
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8298
8331
  NULL,
8299
8332
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8333
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
8334
+ cur = llm_build_ffn(ctx0, cur,
8335
+ model.layers[il].ffn_up, NULL,
8336
+ model.layers[il].ffn_gate, NULL,
8337
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8338
+ NULL,
8339
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
8300
8340
  } else {
8301
8341
  cur = llm_build_ffn(ctx0, cur,
8302
8342
  model.layers[il].ffn_up, NULL,
@@ -8363,9 +8403,6 @@ struct llm_build_context {
8363
8403
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8364
8404
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8365
8405
 
8366
- // positions of the tokens in the KV cache
8367
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8368
-
8369
8406
  inpL = llm_build_norm(ctx0, inpL, hparams,
8370
8407
  model.tok_norm,
8371
8408
  model.tok_norm_b,
@@ -8399,7 +8436,7 @@ struct llm_build_context {
8399
8436
 
8400
8437
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8401
8438
  model.layers[il].wo, model.layers[il].bo,
8402
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8439
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8403
8440
  }
8404
8441
 
8405
8442
  if (il == n_layer - 1) {
@@ -8464,9 +8501,6 @@ struct llm_build_context {
8464
8501
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8465
8502
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8466
8503
 
8467
- // positions of the tokens in the KV cache
8468
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8469
-
8470
8504
  if (model.pos_embd) {
8471
8505
  // inp_pos - contains the positions
8472
8506
  struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8530,13 +8564,13 @@ struct llm_build_context {
8530
8564
 
8531
8565
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8532
8566
  model.layers[il].wo, model.layers[il].bo,
8533
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8567
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8534
8568
  } else {
8535
8569
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8536
8570
 
8537
8571
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8538
8572
  model.layers[il].wo, model.layers[il].bo,
8539
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8573
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8540
8574
  }
8541
8575
  }
8542
8576
 
@@ -8664,15 +8698,15 @@ struct llm_build_context {
8664
8698
  }
8665
8699
 
8666
8700
 
8667
- Qcur = ggml_rope_custom(
8668
- ctx0, Qcur, inp_pos,
8701
+ Qcur = ggml_rope_ext(
8702
+ ctx0, Qcur, inp_pos, nullptr,
8669
8703
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8670
8704
  ext_factor, attn_factor, beta_fast, beta_slow
8671
8705
  );
8672
8706
  cb(Qcur, "Qcur", il);
8673
8707
 
8674
- Kcur = ggml_rope_custom(
8675
- ctx0, Kcur, inp_pos,
8708
+ Kcur = ggml_rope_ext(
8709
+ ctx0, Kcur, inp_pos, nullptr,
8676
8710
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8677
8711
  ext_factor, attn_factor, beta_fast, beta_slow
8678
8712
  );
@@ -8680,7 +8714,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8784,21 +8818,21 @@ struct llm_build_context {
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
- Qcur = ggml_rope_custom(
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8895,15 +8929,15 @@ struct llm_build_context {
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -8911,7 +8945,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -9009,15 +9043,15 @@ struct llm_build_context {
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -9025,7 +9059,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -9161,8 +9195,8 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

- Qcur = ggml_rope_custom(
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);
@@ -9172,15 +9206,15 @@ struct llm_build_context {
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -9249,6 +9283,9 @@ struct llm_build_context {

  // self-attention
  {
+ // rope freq factors for 128k context
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
  model.layers[il].attn_norm,
  NULL,
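build_rope_factors(il) supplies a per-layer rope_factors tensor that this builder threads into ggml_rope_ext, stretching the rotation frequencies for long-context (128k) model variants. A hedged sketch of how such factors might be selected — the function name and selection rule below are assumptions for illustration, not the llama.cpp implementation:

#include <cstddef>
#include <cstdio>

// Assumed convention: long-context checkpoints ship "short" and "long" factor
// sets, chosen by comparing the requested context to the trained context.
static const float * pick_rope_factors(size_t n_ctx, size_t n_ctx_train,
                                       const float * factors_short,
                                       const float * factors_long) {
    return (n_ctx > n_ctx_train) ? factors_long : factors_short;
}

int main() {
    const float fshort[2] = {1.0f, 1.0f}; // placeholder values
    const float flong [2] = {4.0f, 8.0f}; // placeholder values
    const float * f = pick_rope_factors(131072, 4096, fshort, flong);
    printf("factor[0] = %g\n", f[0]); // the long set is picked for a 128k request
}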
@@ -9280,8 +9317,8 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

- Qcur = ggml_rope_custom(
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);
@@ -9289,15 +9326,15 @@ struct llm_build_context {
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -9396,21 +9433,21 @@ struct llm_build_context {
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
  cb(Kcur, "Kcur", il);

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
  struct ggml_tensor * sa_out = cur;

@@ -9513,7 +9550,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -9604,15 +9641,15 @@ struct llm_build_context {
  cb(tmpk, "tmpk", il);
  cb(Vcur, "Vcur", il);

- struct ggml_tensor * Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+ struct ggml_tensor * Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- struct ggml_tensor * Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ struct ggml_tensor * Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -9620,7 +9657,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -9720,15 +9757,15 @@ struct llm_build_context {
  // cb(Vcur, "Vcur", il);
  // }

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -9736,7 +9773,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -9837,15 +9874,15 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -9853,7 +9890,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -9967,15 +10004,15 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -9983,7 +10020,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -10087,8 +10124,8 @@ struct llm_build_context {
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
  cb(Qcur, "Qcur", il);
@@ -10096,15 +10133,15 @@ struct llm_build_context {
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
  cb(Qcur, "Qcur_scaled", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
  cb(Kcur, "Kcur", il);

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -10207,15 +10244,15 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -10223,7 +10260,7 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -10490,22 +10527,267 @@ struct llm_build_context {
  LLM_NORM, cb, il);
  cb(Qcur, "Qcur", il);

- Kcur = llm_build_norm(ctx0, Kcur, hparams,
- model.layers[il].attn_k_norm,
- NULL,
- LLM_NORM, cb, il);
- cb(Kcur, "Kcur", il);
- }
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Kcur, "Kcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ }
+
+ struct ggml_tensor * attn_out = cur;
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, ffn_inp,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ // add together residual + FFN + self-attention
+ cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx0, cur, attn_out);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+
+ if (f_logit_scale) {
+ cur = ggml_scale(ctx0, cur, f_logit_scale);
+ }
+
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+
+ }
+
+ // ref: https://allenai.org/olmo
+ // based on the original build_llama() function, changes:
+ // * non-parametric layer norm
+ // * clamp qkv
+ // * removed bias
+ // * removed MoE
+ struct ggml_cgraph * build_olmo() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ NULL, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, nullptr,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ NULL, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ NULL, NULL,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_gptneox() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
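The new build_olmo() graph above clamps the Q/K/V projections to ±hparams.f_clamp_kqv before RoPE. Elementwise, the ggml_clamp calls amount to the following plain-array sketch (clamp bound chosen arbitrarily for the demo):

#include <algorithm>
#include <cstdio>

int main() {
    const float clamp = 8.0f; // stands in for hparams.f_clamp_kqv; > 0 enables the clamp
    float qkv[5] = {-12.5f, -3.0f, 0.0f, 7.9f, 42.0f};
    for (float & v : qkv) {
        v = std::min(std::max(v, -clamp), clamp); // what ggml_clamp does per element
    }
    for (float v : qkv) printf("%g ", v); // -8 -3 0 7.9 8
    printf("\n");
}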
@@ -10513,68 +10795,84 @@ struct llm_build_context {

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
  // skip computing output for unused tokens
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  }

- struct ggml_tensor * attn_out = cur;
+ // ffn
+ if (hparams.use_par_res) {
+ // attention and ffn are computed in parallel
+ // x = x + attn(ln1(x)) + ffn(ln2(x))

- // feed-forward network
- {
- cur = llm_build_ffn(ctx0, ffn_inp,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ struct ggml_tensor * attn_out = cur;
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  cb(cur, "ffn_out", il);
- }

- // add together residual + FFN + self-attention
- cur = ggml_add(ctx0, cur, inpL);
- cur = ggml_add(ctx0, cur, attn_out);
- cb(cur, "l_out", il);
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "ffn_out", il);

- // input for next layer
- inpL = cur;
- }
+ inpL = ggml_add(ctx0, cur, attn_out);
+ cb(inpL, "l_out", il);
+ } else {
+ // attention and ffn are computed sequentially
+ // x = x + attn(ln1(x))
+ // x = x + ffn(ln2(x))

- cur = inpL;
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);

- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM, cb, -1);
- cb(cur, "result_norm", -1);
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);

- // lm_head
- cur = ggml_mul_mat(ctx0, model.output, cur);
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);

- if (f_logit_scale) {
- cur = ggml_scale(ctx0, cur, f_logit_scale);
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
  }

+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
  cb(cur, "result_output", -1);

  ggml_build_forward_expand(gf, cur);

  return gf;
-
  }

- // ref: https://allenai.org/olmo
- // based on the original build_llama() function, changes:
- // * non-parametric layer norm
- // * clamp qkv
- // * removed bias
- // * removed MoE
- struct ggml_cgraph * build_olmo() {
+ struct ggml_cgraph * build_arctic() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  // mutable variable, needed during the last layer of the computation to skip unused tokens
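build_gptneox() branches on hparams.use_par_res: parallel-residual checkpoints compute attention and the FFN from the same input, x = x + attn(ln1(x)) + ffn(ln2(x)), while the sequential layout applies the FFN to the already-updated stream. A scalar caricature with stand-in ops (ln1/ln2/attn/ffn below are arbitrary placeholders, not the real graph operations):

#include <cstdio>

static float ln1 (float x) { return x * 0.5f;  }
static float ln2 (float x) { return x * 0.25f; }
static float attn(float x) { return x + 1.0f;  }
static float ffn (float x) { return x * 2.0f;  }

static float layer(float x, bool use_par_res) {
    if (use_par_res) {
        return x + attn(ln1(x)) + ffn(ln2(x)); // parallel: one residual, two branches
    }
    x = x + attn(ln1(x));                      // sequential: attention first...
    return x + ffn(ln2(x));                    // ...then the FFN on the updated stream
}

int main() {
    printf("parallel=%g sequential=%g\n", layer(1.0f, true), layer(1.0f, false));
}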
@@ -10600,8 +10898,8 @@ struct llm_build_context {

  // norm
  cur = llm_build_norm(ctx0, inpL, hparams,
- NULL, NULL,
- LLM_NORM, cb, il);
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
  cb(cur, "attn_norm", il);

  // self-attention
@@ -10609,42 +10907,30 @@ struct llm_build_context {
  // compute Q and K and RoPE them
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  cb(Qcur, "Qcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Qcur, "Qcur", il);
- }

  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  cb(Kcur, "Kcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Kcur, "Kcur", il);
- }

  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  cb(Vcur, "Vcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Vcur, "Vcur", il);
- }

- Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);

  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- model.layers[il].wo, nullptr,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -10660,8 +10946,8 @@ struct llm_build_context {

  // feed-forward network
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
- NULL, NULL,
- LLM_NORM, cb, il);
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
  cb(cur, "ffn_norm", il);

  cur = llm_build_ffn(ctx0, cur,
@@ -10672,7 +10958,26 @@ struct llm_build_context {
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);

- cur = ggml_add(ctx0, cur, ffn_inp);
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+ cb(ffn_out, "ffn_out", il);
+
+ // MoE
+ cur = llm_build_norm(ctx0, inpSA, hparams,
+ model.layers[il].ffn_norm_exps, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm_exps", il);
+
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out);
  cb(cur, "ffn_out", il);

  ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
@@ -10688,8 +10993,8 @@ struct llm_build_context {
  cur = inpL;

  cur = llm_build_norm(ctx0, cur, hparams,
- NULL, NULL,
- LLM_NORM, cb, -1);
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
  cb(cur, "result_norm", -1);

  // lm_head
@@ -10816,15 +11121,12 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_starcoder();
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- result = llm.build_persimmon();
- } break;
  case LLM_ARCH_REFACT:
  {
  result = llm.build_refact();
  } break;
  case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V2:
  case LLM_ARCH_NOMIC_BERT:
  {
  result = llm.build_bert();
@@ -10913,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_olmo();
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ result = llm.build_gptneox();
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ result = llm.build_arctic();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -11032,11 +11342,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
  f = -INFINITY;
  } else {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(lctx.kv_self.cells[i].pos - pos);
+ } else {
+ f = 0.0f;
+ }
  }
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
  }
  }
+
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+ for (int j = 0; j < n_kv; ++j) {
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+ }
+ }
  }
  } else {
  // when using kv cache, the mask needs to match the kv cache size
@@ -11055,7 +11375,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  float f = -INFINITY;
  for (int s = 0; s < batch.n_seq_id[i]; ++s) {
  if (batch.seq_id[i][s] == seq_id) {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(batch.pos[i] - batch.pos[j]);
+ } else {
+ f = 0.0f;
+ }
  break;
  }
  }
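With the KQ_pos tensor gone, llama_set_inputs now writes the ALiBi distance term straight into KQ_mask: visible cells get -|pos_q - pos_k| instead of 0 when use_alibi is set, while masked cells keep -INFINITY (per-head slopes are applied separately, later in the graph). A self-contained illustration of the values produced for one query row:

#include <cmath>
#include <cstdio>

int main() {
    const int n_kv  = 5;
    const int pos_q = 3; // example query position
    for (int i = 0; i < n_kv; ++i) {
        float f = (i > pos_q) ? -INFINITY                 // future cell: stays masked out
                              : -fabsf(float(i - pos_q)); // visible: linear distance bias
        printf("%6.1f ", f);                              // -3.0 -2.0 -1.0 0.0 -inf
    }
    printf("\n");
}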
@@ -11071,21 +11395,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  }
  }

- // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
- // this allows to process multiple sequences in parallel with ALiBi-based models
- if (hparams.use_alibi) {
- const int64_t n_kv = kv_self.n;
-
- GGML_ASSERT(lctx.inp_KQ_pos);
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
- float * data = (float *) lctx.inp_KQ_pos->data;
-
- for (int i = 0; i < n_kv; ++i) {
- data[i] = float(lctx.kv_self.cells[i].pos);
- }
- }
-
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
  const int64_t n_tokens = batch.n_tokens;

@@ -11259,11 +11568,6 @@ static void llama_graph_compute(
  llama_context & lctx,
  ggml_cgraph * gf,
  int n_threads) {
- #ifdef GGML_USE_MPI
- const int64_t n_layer = lctx.model.hparams.n_layer;
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
- #endif
-
  #ifdef GGML_USE_METAL
  if (ggml_backend_is_metal(lctx.backend_metal)) {
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11278,10 +11582,6 @@ static void llama_graph_compute(
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);

  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
- #ifdef GGML_USE_MPI
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
- #endif
  }

  // decode a batch of tokens by evaluating the transformer
@@ -11319,12 +11619,6 @@ static int llama_decode_internal(
  }
  lctx.n_queued_tokens += n_tokens_all;

- #ifdef GGML_USE_MPI
- // TODO: needs fix after #3228
- GGML_ASSERT(false && "not implemented");
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
- #endif
-
  auto & kv_self = lctx.kv_self;

  const int64_t n_embd = hparams.n_embd;
@@ -11455,7 +11749,8 @@ static int llama_decode_internal(
  // a heuristic, to avoid attending the full cache if it is not yet utilized
  // after enough generations, the benefit from this heuristic disappears
  // if we start defragmenting the cache, the benefit from this will be more important
- kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
  }
  }
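The hard-coded 256-cell padding is replaced by llama_kv_cache_get_padding(cparams), and cparams.n_ctx is padded with the same value at context creation (see the llama_new_context_with_model hunk below). A sketch of the computation — the concrete padding values used here (256 with flash attention, 32 without) are an assumption for illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Same rounding as GGML_PAD: round x up to a multiple of n.
static uint32_t pad_to(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const uint32_t kv_size    = 4096;  // total KV cache cells
    const uint32_t cell_max   = 700;   // highest occupied cell + 1
    const bool     flash_attn = true;
    const uint32_t pad = flash_attn ? 256u : 32u; // assumed return of llama_kv_cache_get_padding
    const uint32_t n = std::min(kv_size, std::max(pad, pad_to(cell_max, pad)));
    printf("attend to %u of %u cells\n", n, kv_size); // 768 of 4096
}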
@@ -12200,13 +12495,14 @@ struct llm_tokenizer_bpe {

  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  int final_prev_index = -1;
+ bool ignore_merges = false;

  std::vector<std::string> word_collection;
  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_BPE:
  switch (vocab.type_pre) {
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ ignore_merges = true;
  word_collection = unicode_regex_split(text, {
  // original regex from tokenizer.json
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12511,12 @@ struct llm_tokenizer_bpe {
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  });
  break;
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ word_collection = unicode_regex_split(text, {
+ // same as llama3
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ });
+ break;
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
  word_collection = unicode_regex_split(text, {
  "[\r\n]",
@@ -12266,6 +12568,7 @@ struct llm_tokenizer_bpe {
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
  });
  break;
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
  word_collection = unicode_regex_split(text, {
  // original regex from tokenizer.json
@@ -12298,6 +12601,11 @@ struct llm_tokenizer_bpe {
  int index = 0;
  size_t offset = 0;

+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+ offset = word.size();
+ }
+
  while (offset < word.size()) {
  llm_symbol sym;
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
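ignore_merges implements the LLaMA-3-style BPE shortcut: if a whole pre-tokenized word is already a vocabulary entry, it is emitted as a single symbol and the byte-level merge loop never runs for it. A toy version of the same control flow, with a made-up vocabulary:

#include <cstdio>
#include <string>
#include <unordered_map>

int main() {
    std::unordered_map<std::string, int> token_to_id = {{"hello", 42}}; // demo vocab
    const std::string word = "hello";
    const bool ignore_merges = true; // set for LLAMA3-pretokenized vocabs
    if (ignore_merges && token_to_id.count(word)) {
        // whole-word vocab hit: one symbol, skip splitting and merging entirely
        printf("single token id=%d for \"%s\"\n", token_to_id[word], word.c_str());
        return 0;
    }
    printf("fall through to byte-level BPE merges\n");
}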
@@ -12483,16 +12791,16 @@ struct llm_tokenizer_wpm {
  // to lowercase, pad chinese characters, pad punctuation
  std::string new_str = "";
  for (uint32_t code : cpts_nfd) {
- int type = unicode_cpt_type(code);
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+ const codepoint_flags flags = unicode_cpt_flags(code);
+ if (flags.is_accent_mark || flags.is_control) {
  continue;
  }
  code = unicode_tolower(code);
- if (type == CODEPOINT_TYPE_SEPARATOR) {
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
  code = ' ';
  }
  std::string s = unicode_cpt_to_utf8(code);
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
  new_str += " ";
  new_str += s;
  new_str += " ";
@@ -12695,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // tokenizer.encode('', add_special_tokens=True) returns [1]
  // tokenizer.encode('', add_special_tokens=False) returns []

+ static const bool rtrim = true; //TODO: as param
+ bool is_prev_special = false;
+ bool special_token_rtrim = false;
+
  if (add_special && vocab.special_add_bos != 0) {
  GGML_ASSERT(vocab.special_bos_id != -1);
  output.push_back(vocab.special_bos_id);
+ is_prev_special = true;
  }

  for (const auto & fragment : fragment_buffer) {
@@ -12709,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // and passing 'add space prefix' as bool argument
  //
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
- if (&fragment == &fragment_buffer.front()) {
- if (vocab.add_space_prefix) {
- raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+ if (special_token_rtrim) {
+ size_t num_whitespaces = 0;
+ while (isspace(raw_text[num_whitespaces])) {
+ num_whitespaces++;
+ }
+ if (num_whitespaces == raw_text.size()) {
+ continue; // skip if all whitespaces
+ }
+ raw_text = raw_text.substr(num_whitespaces);
+ }
+
+ if (vocab.add_space_prefix) {
+ if (!output.size() || is_prev_special) { // prefix with space if first token
+ raw_text = " " + raw_text;
  }
  }

@@ -12723,9 +13048,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  tokenizer.tokenize(raw_text, output);
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
+ is_prev_special = true;
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
+ special_token_rtrim = rtrim
+ && fragment.token != vocab.special_bos_id
+ && fragment.token != vocab.special_unk_id
+ && fragment.token != vocab.special_eos_id;
  }
  }

+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
  if (add_special && vocab.special_add_eos == 1) {
  GGML_ASSERT(vocab.special_eos_id != -1);
  output.push_back(vocab.special_eos_id);
@@ -12752,7 +13090,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  }
  }

- GGML_ASSERT(vocab.special_add_eos != 1);
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && vocab.special_add_eos == 1) {
+ GGML_ASSERT(vocab.special_add_eos != -1);
+ output.push_back(vocab.special_eos_id);
+ }
  } break;
  case LLAMA_VOCAB_TYPE_WPM:
  {
@@ -13106,6 +13454,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
  return rejects;
  }

+ static bool llama_grammar_detect_left_recursion(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ size_t rule_index,
+ std::vector<bool> * rules_visited,
+ std::vector<bool> * rules_in_progress,
+ std::vector<bool> * rules_may_be_empty) {
+ if ((*rules_in_progress)[rule_index]) {
+ return true;
+ }
+
+ (*rules_in_progress)[rule_index] = true;
+
+ const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+ // First check if the rule might produce the empty string. This could be done combined with the second
+ // step but it's more readable as two steps.
+ bool at_rule_start = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ if (at_rule_start) {
+ (*rules_may_be_empty)[rule_index] = true;
+ break;
+ }
+ at_rule_start = true;
+ } else {
+ at_rule_start = false;
+ }
+ }
+
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+ // be empty)
+ bool recurse_into_nonterminal = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+ return true;
+ }
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+ recurse_into_nonterminal = false;
+ }
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ recurse_into_nonterminal = true;
+ } else {
+ recurse_into_nonterminal = false;
+ }
+ }
+
+ (*rules_in_progress)[rule_index] = false;
+ (*rules_visited)[rule_index] = true;
+ return false;
+ }
+
  //
  // grammar - external
  //
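llama_grammar_detect_left_recursion walks rule references depth-first, treating possibly-empty nonterminals as transparent, so both direct and indirect left recursion are caught before the grammar is instantiated; such grammars would otherwise expand parser stacks forever. For example, a GBNF grammar the new check in llama_grammar_init would reject, shown as a string literal for illustration only:

#include <cstdio>

int main() {
    // "expr" derives itself in leftmost position, so sampling could never
    // make progress; the loader now throws instead of hanging.
    const char * bad_gbnf =
        "root ::= expr\n"
        "expr ::= expr \"+\" term | term\n" // direct left recursion on expr
        "term ::= [0-9]+\n";
    printf("%s", bad_gbnf);
}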
@@ -13125,6 +13525,19 @@ struct llama_grammar * llama_grammar_init(
  vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
  }

+ // Check for left recursion
+ std::vector<bool> rules_visited(n_rules);
+ std::vector<bool> rules_in_progress(n_rules);
+ std::vector<bool> rules_may_be_empty(n_rules);
+ for (size_t i = 0; i < n_rules; i++) {
+ if (rules_visited[i]) {
+ continue;
+ }
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+ throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+ }
+ }
+
  // loop over alternates of start rule to build initial stacks
  std::vector<std::vector<const llama_grammar_element *>> stacks;
  pos = vec_rules[start_rule_index].data();
@@ -13147,6 +13560,9 @@ struct llama_grammar * llama_grammar_init(
  }
  } while (true);

+ // Important: vec_rules has to be moved here, not copied, because stacks contains
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
  return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
  }

@@ -13741,9 +14157,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

  // Sample the next word X using top-k sampling
  llama_sample_top_k(nullptr, candidates, int(k), 1);
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  llama_token X = llama_sample_token(ctx, candidates);
  t_start_sample_us = ggml_time_us();

@@ -13757,9 +14171,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
  // Update mu using the learning rate and error
  *mu = *mu - eta * e;

- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  return X;
  }

@@ -14344,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
- (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  if (qs.model.type == MODEL_70B) {
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15246,6 +15656,7 @@ struct llama_model_params llama_model_default_params() {
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
+ /*.rpc_servers =*/ nullptr,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15727,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  }

  size_t llama_max_devices(void) {
- #if defined(GGML_USE_METAL)
+ #if defined(GGML_USE_RPC)
+ return GGML_RPC_MAX_SERVERS;
+ #elif defined(GGML_USE_METAL)
  return 1;
  #elif defined(GGML_USE_CUDA)
  return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15752,7 @@ bool llama_supports_mlock(void) {

  bool llama_supports_gpu_offload(void) {
  #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  return true;
  #else
@@ -15356,10 +15769,6 @@ void llama_backend_init(void) {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
-
- #ifdef GGML_USE_MPI
- ggml_mpi_backend_init();
- #endif
  }

  void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15369,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
  }

  void llama_backend_free(void) {
- #ifdef GGML_USE_MPI
- ggml_mpi_backend_free();
- #endif
  ggml_quantize_free();
  }

@@ -15402,7 +15808,17 @@ struct llama_model * llama_load_model_from_file(
  return true;
  };
  }
-
+ if (params.rpc_servers != nullptr) {
+ // split the servers set them into model->rpc_servers
+ std::string servers(params.rpc_servers);
+ size_t pos = 0;
+ while ((pos = servers.find(",")) != std::string::npos) {
+ std::string server = servers.substr(0, pos);
+ model->rpc_servers.push_back(server);
+ servers.erase(0, pos + 1);
+ }
+ model->rpc_servers.push_back(servers);
+ }
  int status = llama_model_load(path_model, *model, params);
  GGML_ASSERT(status <= 0);
  if (status < 0) {
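params.rpc_servers takes a comma-separated host:port list; each entry is split into model->rpc_servers here, and each later gets its own ggml_backend_rpc_init connection (see the backend-initialization hunk below). The splitting logic, lifted into a standalone program with placeholder addresses:

#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::string servers = "192.168.1.10:50052,192.168.1.11:50052"; // placeholder endpoints
    std::vector<std::string> out;
    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos)); // everything before the comma
        servers.erase(0, pos + 1);             // drop the entry and its comma
    }
    out.push_back(servers);                    // last (or only) entry
    for (const auto & s : out) printf("%s\n", s.c_str());
}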
@@ -15441,6 +15857,11 @@ struct llama_context * llama_new_context_with_model(
15441
15857
  return nullptr;
15442
15858
  }
15443
15859
 
15860
+ if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
15861
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15862
+ params.flash_attn = false;
15863
+ }
15864
+
15444
15865
  llama_context * ctx = new llama_context(*model);
15445
15866
 
15446
15867
  const auto & hparams = model->hparams;
@@ -15464,7 +15885,7 @@ struct llama_context * llama_new_context_with_model(
15464
15885
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15465
15886
 
15466
15887
  // this is necessary due to kv_self.n being padded later during inference
15467
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15888
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
15468
15889
 
15469
15890
  // with causal attention, the batch size is limited by the context size
15470
15891
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15499,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
          cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
      }
 
+     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
      cparams.causal_attn = hparams.causal_attn;
 
      if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15509,16 +15931,6 @@ struct llama_context * llama_new_context_with_model(
          }
      }
 
-     if (cparams.flash_attn && hparams.use_alibi) {
-         LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-         cparams.flash_attn = false;
-     }
-
-     if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-         LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-         cparams.flash_attn = false;
-     }
-
      if (params.seed == LLAMA_DEFAULT_SEED) {
          params.seed = time(NULL);
      }
@@ -15554,7 +15966,17 @@ struct llama_context * llama_new_context_with_model(
 
      if (!hparams.vocab_only) {
          // initialize backends
- #ifdef GGML_USE_METAL
+ #if defined(GGML_USE_RPC)
+         for (auto & server : model->rpc_servers) {
+             ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+             if (backend == nullptr) {
+                 LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                 llama_free(ctx);
+                 return nullptr;
+             }
+             ctx->backends.push_back(backend);
+         }
+ #elif defined(GGML_USE_METAL)
          if (model->n_gpu_layers > 0) {
              ctx->backend_metal = ggml_backend_metal_init();
              if (ctx->backend_metal == nullptr) {
@@ -15710,7 +16132,11 @@ struct llama_context * llama_new_context_with_model(
          ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
          // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-         bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+         bool pipeline_parallel =
+             llama_get_device_count(*model) > 1 &&
+             model->n_gpu_layers > (int)model->hparams.n_layer &&
+             model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+             params.offload_kqv;
  #ifndef GGML_USE_CUDA
          // pipeline parallelism requires support for async compute and events
          // currently this is only implemented in the CUDA backend
@@ -15753,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
          }
      }
 
- #ifdef GGML_USE_MPI
-     ctx->ctx_mpi = ggml_mpi_init();
-
-     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-         // TODO: needs fix after #3228
-         GGML_ASSERT(false && "not implemented");
-         //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-         //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-         llama_backend_free();
-         exit(1);
-     }
- #endif
-
      return ctx;
  }
 
@@ -15803,11 +16215,11 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          // these models do not use RoPE
          case LLM_ARCH_GPT2:
          case LLM_ARCH_GPTJ:
-         case LLM_ARCH_GPTNEOX:
          case LLM_ARCH_MPT:
          case LLM_ARCH_REFACT:
          case LLM_ARCH_BLOOM:
          case LLM_ARCH_MAMBA:
+         case LLM_ARCH_JINA_BERT_V2:
              return LLAMA_ROPE_TYPE_NONE;
 
          // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -15822,13 +16234,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          case LLM_ARCH_XVERSE:
          case LLM_ARCH_COMMAND_R:
          case LLM_ARCH_OLMO:
+         case LLM_ARCH_ARCTIC:
              return LLAMA_ROPE_TYPE_NORM;
 
          // the pairs of head values are offset by n_rot/2
          case LLM_ARCH_FALCON:
          case LLM_ARCH_GROK:
          case LLM_ARCH_DBRX:
-         case LLM_ARCH_PERSIMMON:
          case LLM_ARCH_BERT:
          case LLM_ARCH_NOMIC_BERT:
          case LLM_ARCH_STABLELM:
@@ -15839,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
          case LLM_ARCH_PHI3:
          case LLM_ARCH_GEMMA:
          case LLM_ARCH_STARCODER2:
+         case LLM_ARCH_GPTNEOX:
              return LLAMA_ROPE_TYPE_NEOX;
 
          // all model arches should be listed explicitly here
@@ -15998,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
      }
 
      // make tensors
+     cvec.tensors.reserve(model.hparams.n_layer);
      cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
      for (size_t il = 1; il < model.hparams.n_layer; il++) {
          struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16006,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
      }
 
      // allocate tensors / buffers and zero
+     cvec.ctxs.reserve(ctx_map.size());
+     cvec.bufs.reserve(ctx_map.size());
      for (auto it : ctx_map) {
          ggml_backend_buffer_type_t buft = it.first;
          ggml_context * ctx = it.second;
@@ -16829,13 +17245,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
          }
          else {
              if (cell_range_begin != kv_self.size) {
-                 cell_ranges.push_back({ cell_range_begin, i });
+                 cell_ranges.emplace_back(cell_range_begin, i);
                  cell_range_begin = kv_self.size;
              }
          }
      }
      if (cell_range_begin != kv_self.size) {
-         cell_ranges.push_back({ cell_range_begin, kv_self.size });
+         cell_ranges.emplace_back(cell_range_begin, kv_self.size);
      }
 
      // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
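The emplace_back change is behavior-neutral: the two values are forwarded straight to the pair constructor instead of going through a temporary built by aggregate initialization. Sketch of the equivalence (cell_ranges is assumed here to be a std::vector<std::pair<uint32_t, uint32_t>>):

    cell_ranges.push_back({ begin, end });  // constructs a temporary pair, then moves it in
    cell_ranges.emplace_back(begin, end);   // constructs the pair in place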
@@ -17214,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
      ctx->cparams.n_threads_batch = n_threads_batch;
  }
 
+ uint32_t llama_n_threads(struct llama_context * ctx) {
+     return ctx->cparams.n_threads;
+ }
+
+ uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+     return ctx->cparams.n_threads_batch;
+ }
+
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
      ctx->abort_callback = abort_callback;
      ctx->abort_callback_data = abort_callback_data;
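The two new getters mirror llama_set_n_threads(). A minimal round-trip sketch using only the API shown in this hunk:

    llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/16);
    printf("decode: %u threads, batch: %u threads\n",
           llama_n_threads(ctx), llama_n_threads_batch(ctx)); // prints 8 and 16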
@@ -17648,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
          }
      }
      // llama2 templates seem to not care about "add_generation_prompt"
+     } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+         // Phi 3
+         for (auto message : chat) {
+             std::string role(message->role);
+             ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+         }
+         if (add_ass) {
+             ss << "<|assistant|>\n";
+         }
      } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
          // zephyr template
          for (auto message : chat) {
@@ -17780,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
          if (add_ass) {
              ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
          }
-     } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-         // Phi 3
-         for (auto message : chat) {
-             std::string role(message->role);
-             ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-         }
-         if (add_ass) {
-             ss << "<|assistant|>\n";
-         }
      } else {
          // template not supported
          return -1;
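Net effect of the two chat-template hunks: the Phi 3 branch moves ahead of the zephyr check (whose <|user|> substring match can otherwise claim a Phi 3 template first) and stops trim()ing message content. With add_ass set, a single user message now renders roughly as (example content is illustrative):

    <|user|>
    Can you summarize this file?<|end|>
    <|assistant|>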
@@ -17910,6 +18334,7 @@ const char * llama_print_system_info(void) {
      s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
      s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
      s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
      s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
      s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
      s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
@@ -17970,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
      g_state.log_callback_user_data = user_data;
  #ifdef GGML_USE_METAL
      ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+     ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
  #endif
  }
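A minimal sketch of installing a custom log sink; with this hunk a CUDA build forwards ggml-cuda backend logs to the same callback (signature per ggml_log_callback in ggml.h):

    #include "llama.h"
    #include <cstdio>

    static void log_to_stderr(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) user_data;
        fputs(text, stderr); // text arrives pre-formatted, newline included
    }

    // somewhere during startup, before loading models:
    llama_log_set(log_to_stderr, nullptr);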