llama_cpp 0.15.2 → 0.15.4

@@ -26,16 +26,9 @@
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
- #ifdef GGML_USE_MPI
- # include "ggml-mpi.h"
- #endif
- #ifndef QK_K
- # ifdef GGML_QKK_64
- # define QK_K 64
- # else
- # define QK_K 256
- # endif
- #endif
+
+ // TODO: replace with ggml API call
+ #define QK_K 256
 
  #ifdef __has_include
  #if __has_include(<unistd.h>)
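A minimal sketch of what the TODO above could become, assuming ggml_blck_size() behaves as in this release (hypothetical call site, not part of the diff):

    // K-quants share the QK_K super-block size; ggml can report it per tensor type
    const int qk_k = ggml_blck_size(GGML_TYPE_Q6_K); // 256 in this build (64 only existed under the removed GGML_QKK_64 option)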
@@ -110,7 +103,7 @@
  #endif
 
  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 60
+ #define LLAMA_MAX_EXPERTS 160
 
  //
  // logging
@@ -205,7 +198,6 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,8 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -242,7 +236,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MPT, "mpt" },
  { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -266,6 +259,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
@@ -286,11 +281,15 @@ enum llm_kv {
  LLM_KV_CONTEXT_LENGTH,
  LLM_KV_EMBEDDING_LENGTH,
  LLM_KV_BLOCK_COUNT,
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_EXPERT_SHARED_COUNT,
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
  LLM_KV_POOLING_TYPE,
  LLM_KV_LOGIT_SCALE,
 
@@ -303,14 +302,18 @@ enum llm_kv {
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
  LLM_KV_ATTENTION_CAUSAL,
+ LLM_KV_ATTENTION_Q_LORA_RANK,
+ LLM_KV_ATTENTION_KV_LORA_RANK,
 
  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_FREQ_BASE,
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
  LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
 
  LLM_KV_SPLIT_NO,
  LLM_KV_SPLIT_COUNT,
@@ -359,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -380,14 +387,18 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
 
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
 
  { LLM_KV_SPLIT_NO, "split.no" },
  { LLM_KV_SPLIT_COUNT, "split.count" },
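To make the key patterns above concrete: the "%s" placeholder is filled with the architecture name from LLM_ARCH_NAMES. A hedged illustration using the format() helper this file already relies on (hypothetical call site, not part of the diff):

    const std::string key = format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_KV_LORA_RANK), "deepseek2");
    // key == "deepseek2.attention.kv_lora_rank"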
@@ -441,6 +452,8 @@ enum llm_tensor {
441
452
  LLM_TENSOR_OUTPUT,
442
453
  LLM_TENSOR_OUTPUT_NORM,
443
454
  LLM_TENSOR_ROPE_FREQS,
455
+ LLM_TENSOR_ROPE_FACTORS_LONG,
456
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
444
457
  LLM_TENSOR_ATTN_Q,
445
458
  LLM_TENSOR_ATTN_K,
446
459
  LLM_TENSOR_ATTN_V,
@@ -460,6 +473,7 @@ enum llm_tensor {
460
473
  LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
461
474
  LLM_TENSOR_FFN_GATE_EXP,
462
475
  LLM_TENSOR_FFN_UP_EXP,
476
+ LLM_TENSOR_FFN_NORM_EXPS,
463
477
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
464
478
  LLM_TENSOR_FFN_GATE_EXPS,
465
479
  LLM_TENSOR_FFN_UP_EXPS,
@@ -476,6 +490,12 @@ enum llm_tensor {
476
490
  LLM_TENSOR_SSM_A,
477
491
  LLM_TENSOR_SSM_D,
478
492
  LLM_TENSOR_SSM_OUT,
493
+ LLM_TENSOR_ATTN_Q_A,
494
+ LLM_TENSOR_ATTN_Q_B,
495
+ LLM_TENSOR_ATTN_KV_A_MQA,
496
+ LLM_TENSOR_ATTN_KV_B,
497
+ LLM_TENSOR_ATTN_Q_A_NORM,
498
+ LLM_TENSOR_ATTN_KV_A_NORM,
479
499
  };
480
500
 
481
501
  static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -598,23 +618,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
598
618
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
599
619
  },
600
620
  },
601
- {
602
- LLM_ARCH_PERSIMMON,
603
- {
604
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
605
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
606
- { LLM_TENSOR_OUTPUT, "output"},
607
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
608
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
609
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
610
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
611
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
612
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
613
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
614
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
615
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
616
- },
617
- },
618
621
  {
619
622
  LLM_ARCH_MPT,
620
623
  {
@@ -825,18 +828,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
825
828
  {
826
829
  LLM_ARCH_PHI3,
827
830
  {
828
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
829
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
830
- { LLM_TENSOR_OUTPUT, "output" },
831
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
832
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
833
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
834
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
835
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
836
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
837
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
838
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
839
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
831
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
832
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
833
+ { LLM_TENSOR_OUTPUT, "output" },
834
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
835
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
836
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
837
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
838
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
839
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
840
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
841
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
842
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
843
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
844
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
840
845
  },
841
846
  },
842
847
  {
@@ -1052,6 +1057,57 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1052
1057
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1053
1058
  },
1054
1059
  },
1060
+ {
1061
+ LLM_ARCH_ARCTIC,
1062
+ {
1063
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1064
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1065
+ { LLM_TENSOR_OUTPUT, "output" },
1066
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1067
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1068
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1069
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1070
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1071
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1072
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1073
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1074
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1075
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1076
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
1077
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1078
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1079
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1080
+ },
1081
+ },
1082
+ {
1083
+ LLM_ARCH_DEEPSEEK2,
1084
+ {
1085
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1086
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1087
+ { LLM_TENSOR_OUTPUT, "output" },
1088
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1089
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
1090
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
1091
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1092
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
1093
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
1094
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
1095
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
1096
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1097
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1098
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1099
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1100
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1101
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1102
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1103
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1104
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1105
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
1106
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
1107
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
1108
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
1109
+ },
1110
+ },
1055
1111
  {
1056
1112
  LLM_ARCH_UNKNOWN,
1057
1113
  {
@@ -1646,12 +1702,13 @@ struct llama_mlock {
1646
1702
  };
1647
1703
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1648
1704
 
1649
- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1705
+ // NOTE: avoid ever using this except for building the token_to_piece caches
1706
+ static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
1650
1707
  std::vector<char> result(8, 0);
1651
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
1708
+ const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
1652
1709
  if (n_tokens < 0) {
1653
1710
  result.resize(-n_tokens);
1654
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
1711
+ int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
1655
1712
  GGML_ASSERT(check == -n_tokens);
1656
1713
  }
1657
1714
  else {
@@ -1697,6 +1754,8 @@ struct llama_state {
1697
1754
  llama_state() {
1698
1755
  #ifdef GGML_USE_METAL
1699
1756
  ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
1757
+ #elif defined(GGML_USE_CUDA)
1758
+ ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
1700
1759
  #endif
1701
1760
  }
1702
1761
 
@@ -1710,23 +1769,31 @@ static llama_state g_state;
1710
1769
  // available llama models
1711
1770
  enum e_model {
1712
1771
  MODEL_UNKNOWN,
1772
+ MODEL_14M,
1713
1773
  MODEL_17M,
1714
1774
  MODEL_22M,
1715
1775
  MODEL_33M,
1776
+ MODEL_70M,
1716
1777
  MODEL_109M,
1717
1778
  MODEL_137M,
1779
+ MODEL_160M,
1718
1780
  MODEL_335M,
1781
+ MODEL_410M,
1719
1782
  MODEL_0_5B,
1720
1783
  MODEL_1B,
1784
+ MODEL_1_4B,
1721
1785
  MODEL_2B,
1786
+ MODEL_2_8B,
1722
1787
  MODEL_3B,
1723
1788
  MODEL_4B,
1789
+ MODEL_6_9B,
1724
1790
  MODEL_7B,
1725
1791
  MODEL_8B,
1726
1792
  MODEL_12B,
1727
1793
  MODEL_13B,
1728
1794
  MODEL_14B,
1729
1795
  MODEL_15B,
1796
+ MODEL_16B,
1730
1797
  MODEL_20B,
1731
1798
  MODEL_30B,
1732
1799
  MODEL_34B,
@@ -1734,6 +1801,7 @@ enum e_model {
1734
1801
  MODEL_40B,
1735
1802
  MODEL_65B,
1736
1803
  MODEL_70B,
1804
+ MODEL_236B,
1737
1805
  MODEL_314B,
1738
1806
  MODEL_SMALL,
1739
1807
  MODEL_MEDIUM,
@@ -1743,6 +1811,7 @@ enum e_model {
1743
1811
  MODEL_8x7B,
1744
1812
  MODEL_8x22B,
1745
1813
  MODEL_16x12B,
1814
+ MODEL_10B_128x3_66B,
1746
1815
  };
1747
1816
 
1748
1817
  static const size_t kiB = 1024;
@@ -1752,6 +1821,7 @@ static const size_t GiB = 1024*MiB;
1752
1821
  struct llama_hparams {
1753
1822
  bool vocab_only;
1754
1823
  bool rope_finetuned;
1824
+ bool use_par_res;
1755
1825
 
1756
1826
  uint32_t n_vocab;
1757
1827
  uint32_t n_ctx_train; // context size the model was trained on
@@ -1767,12 +1837,21 @@ struct llama_hparams {
1767
1837
  uint32_t n_expert_used = 0;
1768
1838
  uint32_t n_vocab_type = 0; // for BERT-style token types
1769
1839
 
1840
+ uint32_t n_layer_dense_lead = 0;
1841
+ uint32_t n_lora_q = 0;
1842
+ uint32_t n_lora_kv = 0;
1843
+ uint32_t n_ff_exp = 0;
1844
+ uint32_t n_expert_shared = 0;
1845
+ float expert_weights_scale = 0.0;
1846
+
1770
1847
  float f_norm_eps;
1771
1848
  float f_norm_rms_eps;
1772
1849
 
1850
+ float rope_attn_factor = 1.0f;
1773
1851
  float rope_freq_base_train;
1774
1852
  float rope_freq_scale_train;
1775
1853
  uint32_t n_yarn_orig_ctx;
1854
+ float rope_yarn_log_mul;
1776
1855
 
1777
1856
  // for State Space Models
1778
1857
  uint32_t ssm_d_conv = 0;
@@ -1806,6 +1885,12 @@ struct llama_hparams {
1806
1885
  if (this->n_expert != other.n_expert) return true;
1807
1886
  if (this->n_expert_used != other.n_expert_used) return true;
1808
1887
 
1888
+ if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
1889
+ if (this->n_lora_q != other.n_lora_q) return true;
1890
+ if (this->n_lora_kv != other.n_lora_kv) return true;
1891
+ if (this->n_ff_exp != other.n_ff_exp) return true;
1892
+ if (this->n_expert_shared != other.n_expert_shared) return true;
1893
+
1809
1894
  if (this->rope_finetuned != other.rope_finetuned) return true;
1810
1895
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
1811
1896
 
@@ -1818,8 +1903,11 @@ struct llama_hparams {
1818
1903
 
1819
1904
  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
1820
1905
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
1906
+ if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
1821
1907
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
1822
1908
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
1909
+ if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
1910
+ if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
1823
1911
 
1824
1912
  return false;
1825
1913
  }
@@ -1895,6 +1983,8 @@ struct llama_layer {
1895
1983
  struct ggml_tensor * attn_k_norm_b;
1896
1984
  struct ggml_tensor * attn_out_norm;
1897
1985
  struct ggml_tensor * attn_out_norm_b;
1986
+ struct ggml_tensor * attn_q_a_norm;
1987
+ struct ggml_tensor * attn_kv_a_norm;
1898
1988
 
1899
1989
  // attention
1900
1990
  struct ggml_tensor * wq;
@@ -1902,6 +1992,10 @@ struct llama_layer {
1902
1992
  struct ggml_tensor * wv;
1903
1993
  struct ggml_tensor * wo;
1904
1994
  struct ggml_tensor * wqkv;
1995
+ struct ggml_tensor * wq_a;
1996
+ struct ggml_tensor * wq_b;
1997
+ struct ggml_tensor * wkv_a_mqa;
1998
+ struct ggml_tensor * wkv_b;
1905
1999
 
1906
2000
  // attention bias
1907
2001
  struct ggml_tensor * bq;
@@ -1915,6 +2009,7 @@ struct llama_layer {
1915
2009
  struct ggml_tensor * ffn_norm_b;
1916
2010
  struct ggml_tensor * layer_out_norm;
1917
2011
  struct ggml_tensor * layer_out_norm_b;
2012
+ struct ggml_tensor * ffn_norm_exps;
1918
2013
 
1919
2014
  // ff
1920
2015
  struct ggml_tensor * ffn_gate; // w1
@@ -1934,8 +2029,9 @@ struct llama_layer {
1934
2029
  struct ggml_tensor * ffn_up_shexp;
1935
2030
 
1936
2031
  // ff bias
1937
- struct ggml_tensor * ffn_down_b; // b2
1938
- struct ggml_tensor * ffn_up_b; // b3
2032
+ struct ggml_tensor * ffn_gate_b = nullptr;
2033
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
2034
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
1939
2035
  struct ggml_tensor * ffn_act;
1940
2036
 
1941
2037
  // mamba proj
@@ -1952,6 +2048,10 @@ struct llama_layer {
1952
2048
  // mamba bias
1953
2049
  struct ggml_tensor * ssm_conv1d_b;
1954
2050
  struct ggml_tensor * ssm_dt_b;
2051
+
2052
+ // long rope factors
2053
+ struct ggml_tensor * rope_long = nullptr;
2054
+ struct ggml_tensor * rope_short = nullptr;
1955
2055
  };
1956
2056
 
1957
2057
  struct llama_kv_cell {
@@ -2063,7 +2163,9 @@ struct llama_vocab {
2063
2163
  std::unordered_map<token, id> token_to_id;
2064
2164
  std::vector<token_data> id_to_token;
2065
2165
 
2066
- std::unordered_map<token, id> special_tokens_cache;
2166
+ std::vector<id> cache_special_tokens;
2167
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
2168
+ std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
2067
2169
 
2068
2170
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
2069
2171
 
@@ -2268,10 +2370,6 @@ struct llama_context {
2268
2370
 
2269
2371
  // control vectors
2270
2372
  struct llama_control_vector cvec;
2271
-
2272
- #ifdef GGML_USE_MPI
2273
- ggml_mpi_context * ctx_mpi = NULL;
2274
- #endif
2275
2373
  };
2276
2374
 
2277
2375
  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2589,6 @@ static bool llama_kv_cache_init(
2491
2589
  static bool llama_kv_cache_find_slot(
2492
2590
  struct llama_kv_cache & cache,
2493
2591
  const struct llama_batch & batch) {
2494
- const uint32_t n_ctx = cache.size;
2495
2592
  const uint32_t n_tokens = batch.n_tokens;
2496
2593
 
2497
2594
  if (cache.recurrent) {
@@ -2542,16 +2639,16 @@ static bool llama_kv_cache_find_slot(
2542
2639
  }
2543
2640
  // otherwise, one cell per token.
2544
2641
 
2545
- if (n_tokens > n_ctx) {
2546
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
2642
+ if (n_tokens > cache.size) {
2643
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
2547
2644
  return false;
2548
2645
  }
2549
2646
 
2550
2647
  uint32_t n_tested = 0;
2551
2648
 
2552
2649
  while (true) {
2553
- if (cache.head + n_tokens > n_ctx) {
2554
- n_tested += n_ctx - cache.head;
2650
+ if (cache.head + n_tokens > cache.size) {
2651
+ n_tested += cache.size - cache.head;
2555
2652
  cache.head = 0;
2556
2653
  continue;
2557
2654
  }
@@ -2570,7 +2667,7 @@ static bool llama_kv_cache_find_slot(
2570
2667
  break;
2571
2668
  }
2572
2669
 
2573
- if (n_tested >= n_ctx) {
2670
+ if (n_tested >= cache.size) {
2574
2671
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
2575
2672
  return false;
2576
2673
  }
@@ -3330,6 +3427,39 @@ struct llama_model_loader {
  return get_arr_n(llm_kv(kid), result, required);
  }
 
+ template<typename T>
+ bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+ const int kid = gguf_find_key(meta, key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+ if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+ throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+ }
+
+ // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+ result.resize(arr_info.length);
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+ return true;
+ }
+
+ template<typename T>
+ bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+ return get_arr(llm_kv(kid), result, required);
+ }
+
  template<typename T>
  bool get_key(const std::string & key, T & result, const bool required = true) {
  auto it = kv_overrides.find(key);
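A hedged usage sketch for the new get_arr() overloads above (the key name is made up for illustration; only float32 and int32 GGUF arrays are accepted, and the element type must match):

    std::vector<float> factors;
    if (ml.get_arr("example.rope.scaling.factors", factors, /*required =*/ false)) {
        // factors now holds a copy of the array's arr_info.length elements
    }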
@@ -3404,11 +3534,15 @@ struct llama_model_loader {
  return get_tensor_meta(get_tensor_name(i));
  }
 
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
  ggml_set_name(tensor, ggml_get_name(cur));
 
- n_created++;
+ if (duplicated) {
+ size_data += ggml_nbytes(cur);
+ } else {
+ n_created++;
+ }
 
  return tensor;
  }
@@ -3443,14 +3577,17 @@ struct llama_model_loader {
  return cur;
  }
 
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+ static const int TENSOR_NOT_REQUIRED = 1;
+ static const int TENSOR_DUPLICATED = 2;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
  if (cur == NULL) {
  return NULL;
  }
 
- return create_tensor_for(ctx, cur);
+ return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
  }
 
  struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
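The intent of the two flags is easiest to see in the call sites updated later in this diff, for example the output head that falls back to a duplicated token-embedding tensor:

    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
    if (model.output == NULL) {
        // absent in the GGUF: reuse the input embedding, accounted as duplicated data rather than a created tensor
        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
    }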
@@ -3750,37 +3887,50 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
3750
3887
 
3751
3888
  static const char * llama_model_type_name(e_model type) {
3752
3889
  switch (type) {
3753
- case MODEL_22M: return "22M";
3754
- case MODEL_33M: return "33M";
3755
- case MODEL_109M: return "109M";
3756
- case MODEL_137M: return "137M";
3757
- case MODEL_0_5B: return "0.5B";
3758
- case MODEL_1B: return "1B";
3759
- case MODEL_2B: return "2B";
3760
- case MODEL_3B: return "3B";
3761
- case MODEL_7B: return "7B";
3762
- case MODEL_8B: return "8B";
3763
- case MODEL_12B: return "12B";
3764
- case MODEL_13B: return "13B";
3765
- case MODEL_14B: return "14B";
3766
- case MODEL_15B: return "15B";
3767
- case MODEL_20B: return "20B";
3768
- case MODEL_30B: return "30B";
3769
- case MODEL_34B: return "34B";
3770
- case MODEL_35B: return "35B";
3771
- case MODEL_40B: return "40B";
3772
- case MODEL_65B: return "65B";
3773
- case MODEL_70B: return "70B";
3774
- case MODEL_314B: return "314B";
3775
- case MODEL_SMALL: return "0.1B";
3776
- case MODEL_MEDIUM: return "0.4B";
3777
- case MODEL_LARGE: return "0.8B";
3778
- case MODEL_XL: return "1.5B";
3779
- case MODEL_A2_7B: return "A2.7B";
3780
- case MODEL_8x7B: return "8x7B";
3781
- case MODEL_8x22B: return "8x22B";
3782
- case MODEL_16x12B: return "16x12B";
3783
- default: return "?B";
3890
+ case MODEL_14M: return "14M";
3891
+ case MODEL_17M: return "17M";
3892
+ case MODEL_22M: return "22M";
3893
+ case MODEL_33M: return "33M";
3894
+ case MODEL_70M: return "70M";
3895
+ case MODEL_109M: return "109M";
3896
+ case MODEL_137M: return "137M";
3897
+ case MODEL_160M: return "160M";
3898
+ case MODEL_335M: return "335M";
3899
+ case MODEL_410M: return "410M";
3900
+ case MODEL_0_5B: return "0.5B";
3901
+ case MODEL_1B: return "1B";
3902
+ case MODEL_1_4B: return "1.4B";
3903
+ case MODEL_2B: return "2B";
3904
+ case MODEL_2_8B: return "2.8B";
3905
+ case MODEL_3B: return "3B";
3906
+ case MODEL_4B: return "4B";
3907
+ case MODEL_6_9B: return "6.9B";
3908
+ case MODEL_7B: return "7B";
3909
+ case MODEL_8B: return "8B";
3910
+ case MODEL_12B: return "12B";
3911
+ case MODEL_13B: return "13B";
3912
+ case MODEL_14B: return "14B";
3913
+ case MODEL_15B: return "15B";
3914
+ case MODEL_16B: return "16B";
3915
+ case MODEL_20B: return "20B";
3916
+ case MODEL_30B: return "30B";
3917
+ case MODEL_34B: return "34B";
3918
+ case MODEL_35B: return "35B";
3919
+ case MODEL_40B: return "40B";
3920
+ case MODEL_65B: return "65B";
3921
+ case MODEL_70B: return "70B";
3922
+ case MODEL_236B: return "236B";
3923
+ case MODEL_314B: return "314B";
3924
+ case MODEL_SMALL: return "0.1B";
3925
+ case MODEL_MEDIUM: return "0.4B";
3926
+ case MODEL_LARGE: return "0.8B";
3927
+ case MODEL_XL: return "1.5B";
3928
+ case MODEL_A2_7B: return "A2.7B";
3929
+ case MODEL_8x7B: return "8x7B";
3930
+ case MODEL_8x22B: return "8x22B";
3931
+ case MODEL_16x12B: return "16x12B";
3932
+ case MODEL_10B_128x3_66B: return "10B+128x3.66B";
3933
+ default: return "?B";
3784
3934
  }
3785
3935
  }
3786
3936
 
@@ -3873,6 +4023,8 @@ static void llm_load_hparams(
3873
4023
  }
3874
4024
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
3875
4025
 
4026
+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
4027
+
3876
4028
  // sanity check for n_rot (optional)
3877
4029
  {
3878
4030
  hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3910,7 +4062,9 @@ static void llm_load_hparams(
3910
4062
  switch (hparams.n_layer) {
3911
4063
  case 22: model.type = e_model::MODEL_1B; break;
3912
4064
  case 26: model.type = e_model::MODEL_3B; break;
3913
- case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
4065
+ // granite uses a vocab with len 49152
4066
+ case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
4067
+ case 36: model.type = e_model::MODEL_8B; break; // granite
3914
4068
  case 40: model.type = e_model::MODEL_13B; break;
3915
4069
  case 48: model.type = e_model::MODEL_34B; break;
3916
4070
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3972,14 +4126,6 @@ static void llm_load_hparams(
3972
4126
  default: model.type = e_model::MODEL_UNKNOWN;
3973
4127
  }
3974
4128
  } break;
3975
- case LLM_ARCH_PERSIMMON:
3976
- {
3977
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3978
- switch (hparams.n_layer) {
3979
- case 36: model.type = e_model::MODEL_8B; break;
3980
- default: model.type = e_model::MODEL_UNKNOWN;
3981
- }
3982
- } break;
3983
4129
  case LLM_ARCH_REFACT:
3984
4130
  {
3985
4131
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4267,7 @@ static void llm_load_hparams(
4121
4267
  switch (hparams.n_layer) {
4122
4268
  case 24: model.type = e_model::MODEL_1B; break;
4123
4269
  case 32: model.type = e_model::MODEL_3B; break;
4270
+ case 40: model.type = e_model::MODEL_14B; break;
4124
4271
  default: model.type = e_model::MODEL_UNKNOWN;
4125
4272
  }
4126
4273
  } break;
@@ -4187,6 +4334,8 @@ static void llm_load_hparams(
4187
4334
  case 30: model.type = e_model::MODEL_3B; break;
4188
4335
  case 32: model.type = e_model::MODEL_7B; break;
4189
4336
  case 40: model.type = e_model::MODEL_15B; break;
4337
+ case 52: model.type = e_model::MODEL_20B; break; // granite
4338
+ case 88: model.type = e_model::MODEL_34B; break; // granite
4190
4339
  default: model.type = e_model::MODEL_UNKNOWN;
4191
4340
  }
4192
4341
  } break;
@@ -4261,6 +4410,85 @@ static void llm_load_hparams(
4261
4410
  default: model.type = e_model::MODEL_UNKNOWN;
4262
4411
  }
4263
4412
  } break;
4413
+ case LLM_ARCH_GPTNEOX:
4414
+ {
4415
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4416
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
4417
+ switch (hparams.n_layer) {
4418
+ case 6:
4419
+ switch (hparams.n_ff) {
4420
+ case 512: model.type = e_model::MODEL_14M; break;
4421
+ case 2048: model.type = e_model::MODEL_70M; break;
4422
+ default: model.type = e_model::MODEL_UNKNOWN;
4423
+ } break;
4424
+ case 12:
4425
+ switch (hparams.n_ff) {
4426
+ case 3072: model.type = e_model::MODEL_160M; break;
4427
+ default: model.type = e_model::MODEL_UNKNOWN;
4428
+ } break;
4429
+ case 16:
4430
+ switch (hparams.n_ff) {
4431
+ case 8192: model.type = e_model::MODEL_1B; break;
4432
+ default: model.type = e_model::MODEL_UNKNOWN;
4433
+ } break;
4434
+ case 24:
4435
+ switch (hparams.n_ff) {
4436
+ case 4096: model.type = e_model::MODEL_410M; break;
4437
+ case 8192: model.type = e_model::MODEL_1_4B; break;
4438
+ default: model.type = e_model::MODEL_UNKNOWN;
4439
+ } break;
4440
+ case 32:
4441
+ switch (hparams.n_ff) {
4442
+ case 10240: model.type = e_model::MODEL_2_8B; break;
4443
+ case 16384: model.type = e_model::MODEL_6_9B; break;
4444
+ default: model.type = e_model::MODEL_UNKNOWN;
4445
+ } break;
4446
+ case 36:
4447
+ switch (hparams.n_ff) {
4448
+ case 20480: model.type = e_model::MODEL_12B; break;
4449
+ default: model.type = e_model::MODEL_UNKNOWN;
4450
+ } break;
4451
+ case 44:
4452
+ switch (hparams.n_ff) {
4453
+ case 24576: model.type = e_model::MODEL_20B; break;
4454
+ default: model.type = e_model::MODEL_UNKNOWN;
4455
+ } break;
4456
+ default: model.type = e_model::MODEL_UNKNOWN;
4457
+ }
4458
+ } break;
4459
+ case LLM_ARCH_ARCTIC:
4460
+ {
4461
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4462
+
4463
+ if (hparams.n_expert == 128) {
4464
+ switch (hparams.n_layer) {
4465
+ case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
4466
+ default: model.type = e_model::MODEL_UNKNOWN;
4467
+ }
4468
+ } else {
4469
+ model.type = e_model::MODEL_UNKNOWN;
4470
+ }
4471
+ } break;
4472
+ case LLM_ARCH_DEEPSEEK2:
4473
+ {
4474
+ bool is_lite = (hparams.n_layer == 27);
4475
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4476
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
4477
+ if (!is_lite) {
4478
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
4479
+ }
4480
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
4481
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
4482
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
4483
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
4484
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
4485
+
4486
+ switch (hparams.n_layer) {
4487
+ case 27: model.type = e_model::MODEL_16B; break;
4488
+ case 60: model.type = e_model::MODEL_236B; break;
4489
+ default: model.type = e_model::MODEL_UNKNOWN;
4490
+ }
4491
+ } break;
4264
4492
  default: (void)0;
4265
4493
  }
4266
4494
 
@@ -4367,15 +4595,14 @@ static void llm_load_vocab(
4367
4595
  vocab.special_cls_id = 101;
4368
4596
  vocab.special_mask_id = 103;
4369
4597
  vocab.add_space_prefix = false;
4370
- } else {
4371
- if (tokenizer_model == "gpt2") {
4372
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
4373
- } else {
4374
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4375
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4376
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
4377
- return;
4598
+ } else if (tokenizer_model == "gpt2") {
4599
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
4600
+
4601
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4602
+ if (add_space_prefix_keyidx != -1) {
4603
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4378
4604
  }
4605
+
4379
4606
  // read bpe merges and populate bpe ranks
4380
4607
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4381
4608
  if (merges_keyidx == -1) {
@@ -4409,6 +4636,8 @@ static void llm_load_vocab(
4409
4636
  vocab.special_pad_id = -1;
4410
4637
  vocab.special_cls_id = -1;
4411
4638
  vocab.special_mask_id = -1;
4639
+ } else {
4640
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
4412
4641
  }
4413
4642
 
4414
4643
  // for now, only BPE models have pre-tokenizers
@@ -4461,12 +4690,18 @@ static void llm_load_vocab(
4461
4690
  } else if (
4462
4691
  tokenizer_pre == "qwen2") {
4463
4692
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
4693
+ } else if (
4694
+ tokenizer_pre == "stablelm2") {
4695
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
4464
4696
  } else if (
4465
4697
  tokenizer_pre == "olmo") {
4466
4698
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
4467
4699
  } else if (
4468
4700
  tokenizer_pre == "dbrx") {
4469
4701
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
4702
+ } else if (
4703
+ tokenizer_pre == "smaug-bpe") {
4704
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
4470
4705
  } else {
4471
4706
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4472
4707
  }
@@ -4582,7 +4817,8 @@ static void llm_load_vocab(
4582
4817
  (t.first == "<|eot_id|>" ||
4583
4818
  t.first == "<|im_end|>" ||
4584
4819
  t.first == "<|end|>" ||
4585
- t.first == "<end_of_turn>"
4820
+ t.first == "<end_of_turn>" ||
4821
+ t.first == "<|endoftext|>"
4586
4822
  )
4587
4823
  ) {
4588
4824
  vocab.special_eot_id = t.second;
@@ -4594,97 +4830,40 @@ static void llm_load_vocab(
4594
4830
 
4595
4831
  // build special tokens cache
4596
4832
  {
4597
- // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
4598
- // and will always be correctly labeled in 'added_tokens.json' etc.
4599
- // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
4600
- // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
4601
- // are special tokens.
4602
- // From testing, this appears to correlate 1:1 with special tokens.
4603
- //
4604
-
4605
- // Counting special tokens and verifying in only one direction
4606
- // is sufficient to detect difference in those two sets.
4607
- //
4608
- uint32_t special_tokens_count_by_type = 0;
4609
- uint32_t special_tokens_count_from_verification = 0;
4610
-
4611
- bool special_tokens_definition_mismatch = false;
4612
-
4613
- for (const auto & t : vocab.token_to_id) {
4614
- const auto & token = t.first;
4615
- const auto & id = t.second;
4616
-
4617
- // Count all non-normal tokens in the vocab while iterating
4833
+ for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
4618
4834
  if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
4619
- special_tokens_count_by_type++;
4835
+ vocab.cache_special_tokens.push_back(id);
4620
4836
  }
4837
+ }
4621
4838
 
4622
- // Skip single character tokens
4623
- if (token.length() > 1) {
4624
- bool is_tokenizable = false;
4625
-
4626
- // Split token string representation in two, in all possible ways
4627
- // and check if both halves can be matched to a valid token
4628
- for (unsigned i = 1; i < token.length();) {
4629
- const auto left = token.substr(0, i);
4630
- const auto right = token.substr(i);
4631
-
4632
- // check if we didnt partition in the middle of a utf sequence
4633
- auto utf = utf8_len(left.at(left.length() - 1));
4634
-
4635
- if (utf == 1) {
4636
- if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
4637
- vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
4638
- is_tokenizable = true;
4639
- break;
4640
- }
4641
- i++;
4642
- } else {
4643
- // skip over the rest of multibyte utf sequence
4644
- i += utf - 1;
4645
- }
4646
- }
4839
+ std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
4840
+ [&] (const llama_vocab::id a, const llama_vocab::id b) {
4841
+ return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
4842
+ }
4843
+ );
4647
4844
 
4648
- if (!is_tokenizable) {
4649
- // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
4650
- // it's faster to re-filter them here, since there are way less candidates now
4845
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
4846
+ }
4651
4847
 
4652
- // Calculate a total "utf" length of a token string representation
4653
- size_t utf8_str_len = 0;
4654
- for (unsigned i = 0; i < token.length();) {
4655
- utf8_str_len++;
4656
- i += utf8_len(token.at(i));
4657
- }
4848
+ // build token to piece caches
4849
+ {
4850
+ size_t size_cache = 0;
4658
4851
 
4659
- // And skip the ones which are one character
4660
- if (utf8_str_len > 1) {
4661
- // At this point what we have left are special tokens only
4662
- vocab.special_tokens_cache[token] = id;
4852
+ std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
4853
+ std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
4663
4854
 
4664
- // Count manually found special tokens
4665
- special_tokens_count_from_verification++;
4855
+ for (uint32_t id = 0; id < n_vocab; ++id) {
4856
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
4857
+ cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
4666
4858
 
4667
- // If this manually found special token is not marked as such, flag a mismatch
4668
- if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
4669
- special_tokens_definition_mismatch = true;
4670
- }
4671
- }
4672
- }
4673
- }
4859
+ size_cache += cache_token_to_piece[id].size();
4860
+ size_cache += cache_token_to_piece_special[id].size();
4674
4861
  }
4675
4862
 
4676
- if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
4677
- LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
4678
- __func__,
4679
- special_tokens_count_from_verification, vocab.id_to_token.size(),
4680
- special_tokens_count_by_type, vocab.id_to_token.size()
4681
- );
4682
- } else {
4683
- LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
4684
- __func__,
4685
- special_tokens_count_from_verification, vocab.id_to_token.size()
4686
- );
4687
- }
4863
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
4864
+ std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
4865
+
4866
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
4688
4867
  }
4689
4868
  }
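A hedged sketch of what the caches built above enable (illustrative, not part of the diff): detokenization becomes a plain vector lookup, and cache_special_tokens is kept sorted longest-first so longer special tokens are tried before shorter ones that share a prefix.

    // hypothetical lookup, assuming id is a valid token for this vocab
    const std::string & piece = special ? vocab.cache_token_to_piece_special[id]
                                        : vocab.cache_token_to_piece[id];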
 
@@ -4765,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4765
4944
  if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
4766
4945
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
4767
4946
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
4947
+
4948
+ if (model.arch == LLM_ARCH_DEEPSEEK2) {
4949
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
4950
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
4951
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
4952
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
4953
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
4954
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
4955
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
4956
+ }
4768
4957
  }
4769
4958
 
4770
4959
  // Returns false if cancelled by progress_callback
@@ -4908,6 +5097,7 @@ static bool llm_load_tensors(
4908
5097
  // create tensors for the weights
4909
5098
  {
4910
5099
  const int64_t n_embd = hparams.n_embd;
5100
+ const int64_t n_embd_head = n_embd / hparams.n_head;
4911
5101
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4912
5102
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4913
5103
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4920,8 +5110,6 @@ static bool llm_load_tensors(
4920
5110
  throw std::runtime_error("model has expert layers but no expert layers are used");
4921
5111
  }
4922
5112
 
4923
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
4924
-
4925
5113
  ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
4926
5114
  ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
4927
5115
  ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -4942,12 +5130,10 @@ static bool llm_load_tensors(
4942
5130
  {
4943
5131
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4944
5132
  if (model.arch != LLM_ARCH_MINICPM){
4945
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5133
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
4946
5134
  // if output is NULL, init from the input tok embed
4947
5135
  if (model.output == NULL) {
4948
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4949
- ml.n_created--; // artificial tensor
4950
- ml.size_data += ggml_nbytes(model.output);
5136
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
4951
5137
  }
4952
5138
  }
4953
5139
  }
@@ -4966,10 +5152,10 @@ static bool llm_load_tensors(
4966
5152
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4967
5153
 
4968
5154
  // optional bias tensors
4969
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
4970
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
4971
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
4972
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
5155
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5156
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5157
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5158
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
4973
5159
 
4974
5160
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4975
5161
 
@@ -4977,10 +5163,15 @@ static bool llm_load_tensors(
4977
5163
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4978
5164
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4979
5165
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5166
+
5167
+ // optional MLP bias
5168
+ layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5169
+ layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5170
+ layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
4980
5171
  } else {
4981
5172
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4982
5173
 
4983
- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
5174
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
4984
5175
  if (layer.ffn_gate_exps) {
4985
5176
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4986
5177
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5022,12 +5213,10 @@ static bool llm_load_tensors(
5022
5213
  // output
5023
5214
  {
5024
5215
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5025
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5216
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5026
5217
  // if output is NULL, init from the input tok embed
5027
5218
  if (model.output == NULL) {
5028
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5029
- ml.n_created--; // artificial tensor
5030
- ml.size_data += ggml_nbytes(model.output);
5219
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5031
5220
  }
5032
5221
  }
5033
5222
 
@@ -5050,7 +5239,7 @@ static bool llm_load_tensors(
5050
5239
 
5051
5240
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
5052
5241
 
5053
- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
5242
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
5054
5243
  if (layer.ffn_gate_exps) {
5055
5244
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
5056
5245
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5152,11 +5341,9 @@ static bool llm_load_tensors(
5152
5341
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5153
5342
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5154
5343
 
5155
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5344
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5156
5345
  if (!model.output) {
5157
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
5158
- ml.n_created--; // artificial tensor
5159
- ml.size_data += ggml_nbytes(model.output);
5346
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
5160
5347
  }
5161
5348
  }
5162
5349
 
@@ -5169,8 +5356,8 @@ static bool llm_load_tensors(
5169
5356
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5170
5357
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5171
5358
 
5172
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
5173
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
5359
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5360
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5174
5361
 
5175
5362
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5176
5363
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5188,7 +5375,12 @@ static bool llm_load_tensors(
5188
5375
  {
5189
5376
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5190
5377
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5191
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5378
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5379
+ if (!model.output) {
5380
+ // needs to be on GPU
5381
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5382
+ }
5383
+
5192
5384
  }
5193
5385
 
5194
5386
  for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5408,6 @@ static bool llm_load_tensors(
5216
5408
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5217
5409
  }
5218
5410
  } break;
5219
- case LLM_ARCH_PERSIMMON:
5220
- {
5221
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5222
-
5223
- {
5224
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5225
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5226
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5227
- }
5228
-
5229
- for (int i = 0; i < n_layer; ++i) {
5230
- ggml_context * ctx_layer = ctx_for_layer(i);
5231
- ggml_context * ctx_split = ctx_for_layer_split(i);
5232
-
5233
- auto & layer = model.layers[i];
5234
-
5235
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5236
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5237
-
5238
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5239
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
5240
-
5241
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5242
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
5243
-
5244
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5245
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5246
-
5247
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5248
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5249
-
5250
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5251
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
5252
-
5253
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
5254
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
5255
-
5256
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
5257
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
5258
- }
5259
- } break;
5260
5411
  case LLM_ARCH_BERT:
5261
5412
  case LLM_ARCH_NOMIC_BERT:
5262
5413
  {
@@ -5325,14 +5476,14 @@ static bool llm_load_tensors(
5325
5476
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5326
5477
  layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5327
5478
 
5328
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5329
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5479
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5480
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5330
5481
 
5331
5482
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5332
5483
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5333
5484
 
5334
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5335
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5485
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5486
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5336
5487
 
5337
5488
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5338
5489
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
@@ -5394,18 +5545,16 @@ static bool llm_load_tensors(
5394
5545
  case LLM_ARCH_MPT:
5395
5546
  {
5396
5547
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5397
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
5548
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
5398
5549
 
5399
5550
  // output
5400
5551
  {
5401
5552
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5402
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
5553
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5403
5554
 
5404
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5555
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5405
5556
  if (!model.output) {
5406
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
5407
- ml.n_created--; // artificial tensor
5408
- ml.size_data += ggml_nbytes(model.output);
5557
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
5409
5558
  }
5410
5559
  }
5411
5560
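The deleted `ml.n_created--` / `ml.size_data += ggml_nbytes(...)` pair was the old way of expressing "this output head reuses the token-embedding data"; the `TENSOR_DUPLICATED` flag now carries that intent and lets the loader do the corresponding bookkeeping itself. A small self-contained sketch of the fallback pattern (the map stands in for the GGUF tensor index and is not the gem's real loader state):

#include <cstdio>
#include <map>
#include <string>

int main() {
    std::map<std::string, int> tensors_in_file = {
        {"token_embd.weight", 0},
        // note: no "output.weight" entry -> the model ties its output head
    };

    auto find = [&](const std::string & name) -> const int * {
        auto it = tensors_in_file.find(name);
        return it == tensors_in_file.end() ? nullptr : &it->second;
    };

    const int * output = find("output.weight");      // TENSOR_NOT_REQUIRED: may come back null
    bool duplicated = false;
    if (output == nullptr) {
        output = find("token_embd.weight");           // TENSOR_DUPLICATED: alias of tok_embd
        duplicated = true;
    }

    std::printf("output head: %s\n", duplicated ? "tied to token embeddings" : "separate matrix");
    return 0;
}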
 
@@ -5416,31 +5565,31 @@ static bool llm_load_tensors(
5416
5565
  auto & layer = model.layers[i];
5417
5566
 
5418
5567
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5419
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
5568
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5420
5569
 
5421
5570
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5422
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5571
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5423
5572
 
5424
5573
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5425
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
5574
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5426
5575
 
5427
5576
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5428
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5577
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5429
5578
 
5430
5579
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5431
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
5580
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5432
5581
 
5433
5582
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5434
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
5583
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5435
5584
 
5436
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5437
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5585
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5586
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5438
5587
 
5439
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5440
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5588
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5589
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5441
5590
 
5442
5591
  // AWQ ScaleActivation layer
5443
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
5592
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5444
5593
  }
5445
5594
  } break;
5446
5595
  case LLM_ARCH_STABLELM:
@@ -5469,17 +5618,17 @@ static bool llm_load_tensors(
5469
5618
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5470
5619
 
5471
5620
  // optional bias tensors, present in Stable LM 2 1.6B
5472
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
5473
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
5474
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
5621
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5622
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5623
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5475
5624
 
5476
5625
  // optional q and k layernorms, present in StableLM 2 12B
5477
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
5478
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
5626
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
5627
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
5479
5628
 
5480
5629
  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
5481
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
5482
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5630
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5631
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5483
5632
 
5484
5633
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5485
5634
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5522,12 +5671,10 @@ static bool llm_load_tensors(
5522
5671
  // output
5523
5672
  {
5524
5673
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5525
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5674
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5526
5675
  // if output is NULL, init from the input tok embed
5527
5676
  if (model.output == NULL) {
5528
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5529
- ml.n_created--; // artificial tensor
5530
- ml.size_data += ggml_nbytes(model.output);
5677
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5531
5678
  }
5532
5679
  }
5533
5680
 
@@ -5625,8 +5772,8 @@ static bool llm_load_tensors(
5625
5772
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5626
5773
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5627
5774
 
5628
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
5629
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5775
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5776
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5630
5777
 
5631
5778
  if (layer.wqkv == nullptr) {
5632
5779
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5663,17 +5810,20 @@ static bool llm_load_tensors(
5663
5810
  ggml_context* ctx_layer = ctx_for_layer(i);
5664
5811
  ggml_context* ctx_split = ctx_for_layer_split(i);
5665
5812
 
5666
- auto& layer = model.layers[i];
5813
+ auto & layer = model.layers[i];
5667
5814
 
5668
5815
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
5669
5816
 
5670
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
5671
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5817
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
5818
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5672
5819
 
5673
5820
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
5674
5821
 
5675
5822
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
5676
5823
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
5824
+
5825
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5826
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5677
5827
  }
5678
5828
  } break;
5679
5829
  case LLM_ARCH_PLAMO:
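Unlike every other per-layer tensor in this function, the two new RoPE-factor lookups pass no layer index to `tn(...)`, so all layers resolve to the same `rope_factors_long.weight` / `rope_factors_short.weight` pair of length `n_embd_head/2`; layer 0 loads them as optional tensors and layers with `i != 0` additionally OR in `TENSOR_DUPLICATED`, so the table is stored once and shared. A tiny sketch of that load-once, reference-everywhere pattern (plain C++ with placeholder values, not the loader):

#include <cstdio>
#include <vector>

int main() {
    const int n_layer = 4;

    // one shared frequency-factor table per variant, loaded once (contents are placeholders)
    std::vector<float> rope_factors_long  = {1.0f, 1.0f, 2.0f, 4.0f};
    std::vector<float> rope_factors_short = {1.0f, 1.0f, 1.0f, 1.0f};

    // every layer just stores a reference to the shared tables
    std::vector<const std::vector<float> *> rope_long (n_layer, &rope_factors_long);
    std::vector<const std::vector<float> *> rope_short(n_layer, &rope_factors_short);

    std::printf("layer 0 and layer %d share one long table: %s\n",
                n_layer - 1, rope_long[0] == rope_long[n_layer - 1] ? "yes" : "no");
    return 0;
}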
@@ -5842,9 +5992,7 @@ static bool llm_load_tensors(
5842
5992
 
5843
5993
  // output
5844
5994
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5845
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
5846
- ml.n_created--; // artificial tensor
5847
- ml.size_data += ggml_nbytes(model.output);
5995
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
5848
5996
 
5849
5997
  const int64_t n_ff = hparams.n_ff;
5850
5998
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5879,12 +6027,10 @@ static bool llm_load_tensors(
5879
6027
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5880
6028
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5881
6029
 
5882
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6030
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5883
6031
  // if output is NULL, init from the input tok embed
5884
6032
  if (model.output == NULL) {
5885
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5886
- ml.n_created--; // artificial tensor
5887
- ml.size_data += ggml_nbytes(model.output);
6033
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5888
6034
  }
5889
6035
 
5890
6036
  }
@@ -5935,12 +6081,10 @@ static bool llm_load_tensors(
5935
6081
  {
5936
6082
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5937
6083
 
5938
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6084
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5939
6085
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
5940
6086
  if (model.output == NULL) {
5941
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5942
- ml.n_created--; // artificial tensor
5943
- ml.size_data += ggml_nbytes(model.output);
6087
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5944
6088
  }
5945
6089
  }
5946
6090
 
@@ -6001,9 +6145,7 @@ static bool llm_load_tensors(
6001
6145
  {
6002
6146
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6003
6147
  // init output from the input tok embed
6004
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6005
- ml.n_created--; // artificial tensor
6006
- ml.size_data += ggml_nbytes(model.output);
6148
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6007
6149
  }
6008
6150
 
6009
6151
  for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6177,10 @@ static bool llm_load_tensors(
6035
6177
 
6036
6178
  // output
6037
6179
  {
6038
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6180
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6039
6181
  // if output is NULL, init from the input tok embed
6040
6182
  if (model.output == NULL) {
6041
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6042
- ml.n_created--; // artificial tensor
6043
- ml.size_data += ggml_nbytes(model.output);
6183
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6044
6184
  }
6045
6185
  }
6046
6186
 
@@ -6060,30 +6200,169 @@ static bool llm_load_tensors(
6060
6200
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6061
6201
  }
6062
6202
  } break;
6063
- default:
6064
- throw std::runtime_error("unknown architecture");
6065
- }
6066
- }
6203
+ case LLM_ARCH_GPTNEOX:
6204
+ {
6205
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6206
+ // output
6207
+ {
6208
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6209
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
6210
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6211
+ }
6067
6212
 
6068
- ml.done_getting_tensors();
6213
+ for (int i = 0; i < n_layer; ++i) {
6214
+ ggml_context * ctx_layer = ctx_for_layer(i);
6215
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6069
6216
 
6070
- ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
6071
- model.mappings.reserve(ml.mappings.size());
6217
+ auto & layer = model.layers[i];
6072
6218
 
6073
- // create the backend buffers
6074
- std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
6075
- ctx_bufs.reserve(ctx_map.size());
6219
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6220
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
6076
6221
 
6077
- // Ensure we have enough capacity for the maximum backend buffer we will potentially create
6078
- size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6079
- model.bufs.reserve(n_max_backend_buffer);
6222
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
6223
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
6080
6224
 
6081
- for (auto & it : ctx_map) {
6082
- ggml_backend_buffer_type_t buft = it.first;
6083
- ggml_context * ctx = it.second;
6225
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6226
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
6084
6227
 
6085
- llama_buf_map bufs;
6086
- bufs.reserve(n_max_backend_buffer);
6228
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6229
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
6230
+
6231
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
6232
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
6233
+
6234
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6235
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
6236
+ }
6237
+ } break;
6238
+ case LLM_ARCH_ARCTIC:
6239
+ {
6240
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6241
+
6242
+ // output
6243
+ {
6244
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6245
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6246
+ // if output is NULL, init from the input tok embed
6247
+ if (model.output == NULL) {
6248
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6249
+ }
6250
+ }
6251
+
6252
+ for (int i = 0; i < n_layer; ++i) {
6253
+ ggml_context * ctx_layer = ctx_for_layer(i);
6254
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6255
+
6256
+ auto & layer = model.layers[i];
6257
+
6258
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6259
+
6260
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
6261
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
6262
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
6263
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6264
+
6265
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6266
+
6267
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
6268
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
6269
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
6270
+
6271
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6272
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
6273
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
6274
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
6275
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
6276
+ }
6277
+ } break;
6278
+ case LLM_ARCH_DEEPSEEK2:
6279
+ {
6280
+ bool is_lite = (hparams.n_layer == 27);
6281
+
6282
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
6283
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
6284
+ const uint32_t q_lora_rank = hparams.n_lora_q;
6285
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
6286
+ const uint32_t n_ff_exp = hparams.n_ff_exp;
6287
+
6288
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6289
+
6290
+ // output
6291
+ {
6292
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6293
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6294
+ }
6295
+
6296
+ for (int i = 0; i < n_layer; ++i) {
6297
+ ggml_context * ctx_layer = ctx_for_layer(i);
6298
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6299
+
6300
+ auto & layer = model.layers[i];
6301
+
6302
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6303
+ if (!is_lite) {
6304
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
6305
+ }
6306
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
6307
+
6308
+ if (!is_lite) {
6309
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
6310
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
6311
+ } else {
6312
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
6313
+ }
6314
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
6315
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
6316
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
6317
+
6318
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6319
+
6320
+ if ((uint32_t) i < hparams.n_layer_dense_lead) {
6321
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
6322
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
6323
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6324
+ } else {
6325
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6326
+
6327
+ GGML_ASSERT(hparams.n_expert > 0);
6328
+ GGML_ASSERT(hparams.n_expert_used > 0);
6329
+
6330
+ // MoE branch
6331
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6332
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
6333
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6334
+
6335
+ // Shared expert branch
6336
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6337
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
6338
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6339
+ }
6340
+ }
6341
+ } break;
6342
+ default:
6343
+ throw std::runtime_error("unknown architecture");
6344
+ }
6345
+ }
6346
+
6347
+ ml.done_getting_tensors();
6348
+
6349
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
6350
+ model.mappings.reserve(ml.mappings.size());
6351
+
6352
+ // create the backend buffers
6353
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
6354
+ ctx_bufs.reserve(ctx_map.size());
6355
+
6356
+ // Ensure we have enough capacity for the maximum backend buffer we will potentially create
6357
+ size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6358
+ model.bufs.reserve(n_max_backend_buffer);
6359
+
6360
+ for (auto & it : ctx_map) {
6361
+ ggml_backend_buffer_type_t buft = it.first;
6362
+ ggml_context * ctx = it.second;
6363
+
6364
+ llama_buf_map bufs;
6365
+ bufs.reserve(n_max_backend_buffer);
6087
6366
 
6088
6367
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
6089
6368
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
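For the new `deepseek2` case added above, the projection shapes follow mechanically from the hyper-parameters: Q and K are split into a rotary slice of width `n_rot` and a non-rotary slice of width `n_embd_head_k - n_rot`, and K/V are re-expanded from a compressed latent of width `kv_lora_rank` (with the rotary slice carried alongside it). A worked example with placeholder numbers — chosen only to make the shapes concrete, not taken from any released checkpoint — that prints the `{in, out}` shapes used by the loader:

#include <cstdio>

int main() {
    // placeholder hyper-parameters (illustrative only)
    const int n_embd        = 2048;
    const int n_head        = 16;
    const int n_embd_head_k = 192;   // per-head Q/K width
    const int n_embd_head_v = 128;   // per-head V width
    const int n_rot         = 64;    // rotary slice
    const int q_lora_rank   = 1536;
    const int kv_lora_rank  = 512;

    const int qk_rope = n_rot;
    const int qk_nope = n_embd_head_k - n_rot;

    std::printf("wq_a      {%d, %d}\n", n_embd,       q_lora_rank);
    std::printf("wq_b      {%d, %d}\n", q_lora_rank,  n_head * n_embd_head_k);
    std::printf("wkv_a_mqa {%d, %d}\n", n_embd,       kv_lora_rank + qk_rope);
    std::printf("wkv_b     {%d, %d}\n", kv_lora_rank, n_head * (qk_nope + n_embd_head_v));
    std::printf("wo        {%d, %d}\n", n_head * n_embd_head_v, n_embd);
    return 0;
}

The `n_layer_dense_lead` check in the same hunk keeps the leading block(s) as ordinary dense FFN layers; everything after that uses the routed experts plus the always-on shared experts.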
@@ -6324,10 +6603,7 @@ static struct ggml_tensor * llm_build_inp_embd(
6324
6603
 
6325
6604
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
6326
6605
  } else {
6327
- #ifdef GGML_USE_MPI
6328
- GGML_ASSERT(false && "not implemented");
6329
- #endif
6330
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6606
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6331
6607
  inpL = lctx.inp_embd;
6332
6608
  ggml_set_input(lctx.inp_embd);
6333
6609
  }
@@ -6517,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
6517
6793
  int64_t n_expert_used,
6518
6794
  llm_ffn_op_type type_op,
6519
6795
  bool norm_w,
6796
+ bool scale_w,
6797
+ float w_scale,
6520
6798
  const llm_build_cb & cb,
6521
6799
  int il) {
6522
6800
  int64_t n_embd = cur->ne[0];
@@ -6548,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
6548
6826
 
6549
6827
  weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6550
6828
  }
6829
+ if (scale_w) {
6830
+ weights = ggml_scale(ctx, weights, w_scale);
6831
+ cb(weights, "ffn_moe_weights_scaled", il);
6832
+ }
6551
6833
 
6552
6834
  cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6553
6835
  ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
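The two new parameters let a caller rescale the routing weights after the optional normalization; existing call sites below simply pass `false, 0.0` to keep the old behaviour, and the `expert_weights_scale` metadata key introduced in this release suggests the DeepSeek-2 graph is the intended user. A standalone float-only sketch of the same post-processing (no ggml, purely illustrative):

#include <cstdio>

// Post-process the top-k routing weights the way the graph code above does:
// optionally renormalize them to sum to 1, then optionally scale by a constant.
static void postprocess(float * w, int k, bool norm_w, bool scale_w, float w_scale) {
    if (norm_w) {
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) sum += w[i];
        for (int i = 0; i < k; ++i) w[i] /= sum;
    }
    if (scale_w) {
        for (int i = 0; i < k; ++i) w[i] *= w_scale;
    }
}

int main() {
    float w[3] = {0.5f, 0.3f, 0.2f};                              // already normalized here
    postprocess(w, 3, /*norm_w=*/false, /*scale_w=*/true, /*w_scale=*/2.5f);
    std::printf("%.2f %.2f %.2f\n", w[0], w[1], w[2]);            // 1.25 0.75 0.50
    return 0;
}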
@@ -6652,7 +6934,7 @@ static struct ggml_tensor * llm_build_kqv(
6652
6934
 
6653
6935
  cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6654
6936
 
6655
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6937
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6656
6938
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6657
6939
  }
6658
6940
 
@@ -6661,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
6661
6943
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6662
6944
  cb(kq, "kq", il);
6663
6945
 
6664
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6946
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6665
6947
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6666
6948
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6667
6949
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
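GPT-NeoX joins Phi-2/Phi-3 on the list of architectures whose KQ product is forced to F32. A rough bound shows why half precision is fragile here: with head width d_k and activations bounded by |q_i|, |k_i| <= m, the raw logit satisfies |q . k| <= d_k * m^2; taking d_k = 128 and m ~= 30 as an illustration gives about 1.2e5, beyond the largest finite F16 value (65504), and a single overflowed entry is enough to turn the following softmax row into NaNs. The numbers are illustrative, not measured from any model.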
@@ -6886,17 +7168,20 @@ struct llm_build_context {
6886
7168
  cb(lctx.inp_K_shift, "K_shift", -1);
6887
7169
  ggml_set_input(lctx.inp_K_shift);
6888
7170
 
7171
+
6889
7172
  for (int il = 0; il < n_layer; ++il) {
7173
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
6890
7174
  struct ggml_tensor * tmp =
6891
7175
  // we rotate only the first n_rot dimensions
6892
- ggml_rope_custom_inplace(ctx0,
7176
+ ggml_rope_ext_inplace(ctx0,
6893
7177
  ggml_view_3d(ctx0, kv_self.k_l[il],
6894
7178
  n_embd_head_k, n_head_kv, n_ctx,
6895
7179
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
6896
7180
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6897
7181
  0),
6898
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7182
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6899
7183
  ext_factor, attn_factor, beta_fast, beta_slow);
7184
+
6900
7185
  cb(tmp, "K_shifted", il);
6901
7186
  ggml_build_forward_expand(gf, tmp);
6902
7187
  }
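This is the pattern repeated through the rest of the file: `ggml_rope_custom*` becomes `ggml_rope_ext*` with one extra tensor argument right after the positions — an optional per-dimension frequency-factor table (`nullptr` for architectures without one, the `rope_factors` returned by `build_rope_factors` for Phi-3). Assuming the factors act as per-dimension divisors of the rotary frequency, in the LongRoPE style, a standalone sketch of their effect on the rotation angles looks like this (the table values are placeholders):

#include <cmath>
#include <cstdio>

int main() {
    const int   n_rot     = 8;        // rotary width (illustrative)
    const float freq_base = 10000.0f;
    const int   pos       = 42;
    const float factors[n_rot / 2] = {1.0f, 1.0f, 4.0f, 8.0f};   // placeholder freq-factor table

    for (int i = 0; i < n_rot; i += 2) {
        float theta  = powf(freq_base, -(float)i / n_rot);        // standard RoPE frequency
        float plain  = pos * theta;                               // angle without factors
        float scaled = pos * theta / factors[i / 2];              // angle with factors applied
        std::printf("dim %d: angle %.4f -> %.4f\n", i, plain, scaled);
    }
    return 0;
}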
@@ -6999,6 +7284,17 @@ struct llm_build_context {
6999
7284
  return lctx.inp_pos;
7000
7285
  }
7001
7286
 
7287
+ struct ggml_tensor * build_rope_factors(int il) {
7288
+ // choose long/short freq factors based on the context size
7289
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7290
+
7291
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7292
+ return model.layers[il].rope_long;
7293
+ }
7294
+
7295
+ return model.layers[il].rope_short;
7296
+ }
7297
+
7002
7298
  struct ggml_tensor * build_inp_out_ids() {
7003
7299
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
7004
7300
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -7106,15 +7402,15 @@ struct llm_build_context {
7106
7402
  cb(Vcur, "Vcur", il);
7107
7403
  }
7108
7404
 
7109
- Qcur = ggml_rope_custom(
7110
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7405
+ Qcur = ggml_rope_ext(
7406
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7111
7407
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7112
7408
  ext_factor, attn_factor, beta_fast, beta_slow
7113
7409
  );
7114
7410
  cb(Qcur, "Qcur", il);
7115
7411
 
7116
- Kcur = ggml_rope_custom(
7117
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7412
+ Kcur = ggml_rope_ext(
7413
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7118
7414
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7119
7415
  ext_factor, attn_factor, beta_fast, beta_slow
7120
7416
  );
@@ -7144,9 +7440,9 @@ struct llm_build_context {
7144
7440
  cb(cur, "ffn_norm", il);
7145
7441
 
7146
7442
  cur = llm_build_ffn(ctx0, cur,
7147
- model.layers[il].ffn_up, NULL,
7148
- model.layers[il].ffn_gate, NULL,
7149
- model.layers[il].ffn_down, NULL,
7443
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7444
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
7445
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7150
7446
  NULL,
7151
7447
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
7152
7448
  cb(cur, "ffn_out", il);
@@ -7164,6 +7460,7 @@ struct llm_build_context {
7164
7460
  model.layers[il].ffn_down_exps,
7165
7461
  n_expert, n_expert_used,
7166
7462
  LLM_FFN_SILU, true,
7463
+ false, 0.0,
7167
7464
  cb, il);
7168
7465
  cb(cur, "ffn_moe_out", il);
7169
7466
  }
@@ -7236,13 +7533,13 @@ struct llm_build_context {
7236
7533
 
7237
7534
  switch (model.type) {
7238
7535
  case MODEL_7B:
7239
- Qcur = ggml_rope_custom(
7240
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7536
+ Qcur = ggml_rope_ext(
7537
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7241
7538
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7242
7539
  ext_factor, attn_factor, beta_fast, beta_slow
7243
7540
  );
7244
- Kcur = ggml_rope_custom(
7245
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7541
+ Kcur = ggml_rope_ext(
7542
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7246
7543
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7544
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7545
  );
@@ -7348,15 +7645,15 @@ struct llm_build_context {
7348
7645
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7349
7646
  cb(Vcur, "Vcur", il);
7350
7647
 
7351
- Qcur = ggml_rope_custom(
7352
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7648
+ Qcur = ggml_rope_ext(
7649
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7353
7650
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7354
7651
  ext_factor, attn_factor, beta_fast, beta_slow
7355
7652
  );
7356
7653
  cb(Qcur, "Qcur", il);
7357
7654
 
7358
- Kcur = ggml_rope_custom(
7359
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7655
+ Kcur = ggml_rope_ext(
7656
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7360
7657
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7361
7658
  ext_factor, attn_factor, beta_fast, beta_slow
7362
7659
  );
@@ -7469,14 +7766,14 @@ struct llm_build_context {
7469
7766
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7470
7767
 
7471
7768
  // using mode = 2 for neox mode
7472
- Qcur = ggml_rope_custom(
7473
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7769
+ Qcur = ggml_rope_ext(
7770
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7474
7771
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7475
7772
  );
7476
7773
  cb(Qcur, "Qcur", il);
7477
7774
 
7478
- Kcur = ggml_rope_custom(
7479
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7775
+ Kcur = ggml_rope_ext(
7776
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7480
7777
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7481
7778
  );
7482
7779
  cb(Kcur, "Kcur", il);
@@ -7592,15 +7889,15 @@ struct llm_build_context {
7592
7889
  cb(Vcur, "Vcur", il);
7593
7890
  }
7594
7891
 
7595
- Qcur = ggml_rope_custom(
7596
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7892
+ Qcur = ggml_rope_ext(
7893
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7597
7894
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7598
7895
  ext_factor, attn_factor, beta_fast, beta_slow
7599
7896
  );
7600
7897
  cb(Qcur, "Qcur", il);
7601
7898
 
7602
- Kcur = ggml_rope_custom(
7603
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7899
+ Kcur = ggml_rope_ext(
7900
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7604
7901
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7605
7902
  ext_factor, attn_factor, beta_fast, beta_slow
7606
7903
  );
@@ -7645,6 +7942,7 @@ struct llm_build_context {
7645
7942
  model.layers[il].ffn_down_exps,
7646
7943
  n_expert, n_expert_used,
7647
7944
  LLM_FFN_GELU, true,
7945
+ false, 0.0,
7648
7946
  cb, il);
7649
7947
  cb(cur, "ffn_moe_out", il);
7650
7948
 
@@ -7744,15 +8042,15 @@ struct llm_build_context {
7744
8042
  cb(Kcur, "Kcur", il);
7745
8043
  cb(Vcur, "Vcur", il);
7746
8044
 
7747
- Qcur = ggml_rope_custom(
7748
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8045
+ Qcur = ggml_rope_ext(
8046
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7749
8047
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7750
8048
  ext_factor, attn_factor, beta_fast, beta_slow
7751
8049
  );
7752
8050
  cb(Qcur, "Qcur", il);
7753
8051
 
7754
- Kcur = ggml_rope_custom(
7755
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8052
+ Kcur = ggml_rope_ext(
8053
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7756
8054
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7757
8055
  ext_factor, attn_factor, beta_fast, beta_slow
7758
8056
  );
@@ -7788,6 +8086,7 @@ struct llm_build_context {
7788
8086
  model.layers[il].ffn_down_exps,
7789
8087
  n_expert, n_expert_used,
7790
8088
  LLM_FFN_SILU, true,
8089
+ false, 0.0,
7791
8090
  cb, il);
7792
8091
  cb(cur, "ffn_moe_out", il);
7793
8092
 
@@ -7921,213 +8220,6 @@ struct llm_build_context {
7921
8220
  return gf;
7922
8221
  }
7923
8222
 
7924
- struct ggml_cgraph * build_persimmon() {
7925
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7926
-
7927
- const int64_t n_embd_head = hparams.n_embd_head_v;
7928
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7929
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
7930
-
7931
- struct ggml_tensor * cur;
7932
- struct ggml_tensor * inpL;
7933
-
7934
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7935
-
7936
- // inp_pos - contains the positions
7937
- struct ggml_tensor * inp_pos = build_inp_pos();
7938
-
7939
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7940
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7941
-
7942
- for (int il = 0; il < n_layer; ++il) {
7943
- struct ggml_tensor * residual = inpL;
7944
-
7945
- cur = llm_build_norm(ctx0, inpL, hparams,
7946
- model.layers[il].attn_norm,
7947
- model.layers[il].attn_norm_b,
7948
- LLM_NORM, cb, il);
7949
- cb(cur, "attn_norm", il);
7950
-
7951
- // self attention
7952
- {
7953
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7954
- cb(cur, "wqkv", il);
7955
-
7956
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7957
- cb(cur, "bqkv", il);
7958
-
7959
- // split qkv
7960
- GGML_ASSERT(n_head_kv == n_head);
7961
-
7962
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7963
- cb(tmpqkv, "tmpqkv", il);
7964
-
7965
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7966
- cb(tmpqkv_perm, "tmpqkv", il);
7967
-
7968
- struct ggml_tensor * tmpq = ggml_view_3d(
7969
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7970
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7971
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7972
- 0
7973
- );
7974
- cb(tmpq, "tmpq", il);
7975
-
7976
- struct ggml_tensor * tmpk = ggml_view_3d(
7977
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7978
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7979
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7980
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
7981
- );
7982
- cb(tmpk, "tmpk", il);
7983
-
7984
- // Q/K Layernorm
7985
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
7986
- model.layers[il].attn_q_norm,
7987
- model.layers[il].attn_q_norm_b,
7988
- LLM_NORM, cb, il);
7989
- cb(tmpq, "tmpq", il);
7990
-
7991
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
7992
- model.layers[il].attn_k_norm,
7993
- model.layers[il].attn_k_norm_b,
7994
- LLM_NORM, cb, il);
7995
- cb(tmpk, "tmpk", il);
7996
-
7997
- // RoPE the first n_rot of q/k, pass the other half, and concat.
7998
- struct ggml_tensor * qrot = ggml_view_3d(
7999
- ctx0, tmpq, n_rot, n_head, n_tokens,
8000
- ggml_element_size(tmpq) * n_embd_head,
8001
- ggml_element_size(tmpq) * n_embd_head * n_head,
8002
- 0
8003
- );
8004
- cb(qrot, "qrot", il);
8005
-
8006
- struct ggml_tensor * krot = ggml_view_3d(
8007
- ctx0, tmpk, n_rot, n_head, n_tokens,
8008
- ggml_element_size(tmpk) * n_embd_head,
8009
- ggml_element_size(tmpk) * n_embd_head * n_head,
8010
- 0
8011
- );
8012
- cb(krot, "krot", il);
8013
-
8014
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
8015
- struct ggml_tensor * qpass = ggml_view_3d(
8016
- ctx0, tmpq, n_rot, n_head, n_tokens,
8017
- ggml_element_size(tmpq) * n_embd_head,
8018
- ggml_element_size(tmpq) * n_embd_head * n_head,
8019
- ggml_element_size(tmpq) * n_rot
8020
- );
8021
- cb(qpass, "qpass", il);
8022
-
8023
- struct ggml_tensor * kpass = ggml_view_3d(
8024
- ctx0, tmpk, n_rot, n_head, n_tokens,
8025
- ggml_element_size(tmpk) * n_embd_head,
8026
- ggml_element_size(tmpk) * n_embd_head * n_head,
8027
- ggml_element_size(tmpk) * n_rot
8028
- );
8029
- cb(kpass, "kpass", il);
8030
-
8031
- struct ggml_tensor * qrotated = ggml_rope_custom(
8032
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8033
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8034
- );
8035
- cb(qrotated, "qrotated", il);
8036
-
8037
- struct ggml_tensor * krotated = ggml_rope_custom(
8038
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8039
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8040
- );
8041
- cb(krotated, "krotated", il);
8042
-
8043
- // ggml currently only supports concatenation on dim=2
8044
- // so we need to permute qrot, qpass, concat, then permute back.
8045
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
8046
- cb(qrotated, "qrotated", il);
8047
-
8048
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
8049
- cb(krotated, "krotated", il);
8050
-
8051
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
8052
- cb(qpass, "qpass", il);
8053
-
8054
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
8055
- cb(kpass, "kpass", il);
8056
-
8057
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
8058
- cb(Qcur, "Qcur", il);
8059
-
8060
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
8061
- cb(Kcur, "Kcur", il);
8062
-
8063
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
8064
- cb(Q, "Q", il);
8065
-
8066
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
8067
- cb(Kcur, "Kcur", il);
8068
-
8069
- struct ggml_tensor * Vcur = ggml_view_3d(
8070
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
8071
- ggml_element_size(tmpqkv_perm) * n_embd_head,
8072
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
8073
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
8074
- );
8075
- cb(Vcur, "Vcur", il);
8076
-
8077
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8078
- model.layers[il].wo, model.layers[il].bo,
8079
- Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8080
- }
8081
-
8082
- if (il == n_layer - 1) {
8083
- // skip computing output for unused tokens
8084
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8085
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8086
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8087
- }
8088
-
8089
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
8090
- cb(ffn_inp, "ffn_inp", il);
8091
-
8092
- // feed-forward network
8093
- {
8094
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
8095
- model.layers[il].ffn_norm,
8096
- model.layers[il].ffn_norm_b,
8097
- LLM_NORM, cb, il);
8098
- cb(cur, "ffn_norm", il);
8099
-
8100
- cur = llm_build_ffn(ctx0, cur,
8101
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8102
- NULL, NULL,
8103
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8104
- NULL,
8105
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
8106
- cb(cur, "ffn_out", il);
8107
- }
8108
-
8109
- cur = ggml_add(ctx0, cur, ffn_inp);
8110
- cb(cur, "l_out", il);
8111
-
8112
- inpL = cur;
8113
- }
8114
-
8115
- cur = inpL;
8116
-
8117
- cur = llm_build_norm(ctx0, cur, hparams,
8118
- model.output_norm,
8119
- model.output_norm_b,
8120
- LLM_NORM, cb, -1);
8121
- cb(cur, "result_norm", -1);
8122
-
8123
- cur = ggml_mul_mat(ctx0, model.output, cur);
8124
- cb(cur, "result_output", -1);
8125
-
8126
- ggml_build_forward_expand(gf, cur);
8127
-
8128
- return gf;
8129
- }
8130
-
8131
8223
  struct ggml_cgraph * build_refact() {
8132
8224
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8133
8225
 
@@ -8304,15 +8396,15 @@ struct llm_build_context {
8304
8396
  cb(Kcur, "Kcur", il);
8305
8397
  cb(Vcur, "Vcur", il);
8306
8398
 
8307
- Qcur = ggml_rope_custom(
8308
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8399
+ Qcur = ggml_rope_ext(
8400
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8309
8401
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8310
8402
  ext_factor, attn_factor, beta_fast, beta_slow
8311
8403
  );
8312
8404
  cb(Qcur, "Qcur", il);
8313
8405
 
8314
- Kcur = ggml_rope_custom(
8315
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8406
+ Kcur = ggml_rope_ext(
8407
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8316
8408
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8317
8409
  ext_factor, attn_factor, beta_fast, beta_slow
8318
8410
  );
@@ -8744,15 +8836,15 @@ struct llm_build_context {
8744
8836
  }
8745
8837
 
8746
8838
 
8747
- Qcur = ggml_rope_custom(
8748
- ctx0, Qcur, inp_pos,
8839
+ Qcur = ggml_rope_ext(
8840
+ ctx0, Qcur, inp_pos, nullptr,
8749
8841
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8750
8842
  ext_factor, attn_factor, beta_fast, beta_slow
8751
8843
  );
8752
8844
  cb(Qcur, "Qcur", il);
8753
8845
 
8754
- Kcur = ggml_rope_custom(
8755
- ctx0, Kcur, inp_pos,
8846
+ Kcur = ggml_rope_ext(
8847
+ ctx0, Kcur, inp_pos, nullptr,
8756
8848
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8757
8849
  ext_factor, attn_factor, beta_fast, beta_slow
8758
8850
  );
@@ -8864,14 +8956,14 @@ struct llm_build_context {
8864
8956
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8865
8957
 
8866
8958
  // using mode = 2 for neox mode
8867
- Qcur = ggml_rope_custom(
8868
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8959
+ Qcur = ggml_rope_ext(
8960
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8869
8961
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8870
8962
  );
8871
8963
  cb(Qcur, "Qcur", il);
8872
8964
 
8873
- Kcur = ggml_rope_custom(
8874
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8965
+ Kcur = ggml_rope_ext(
8966
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8875
8967
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8876
8968
  );
8877
8969
  cb(Kcur, "Kcur", il);
@@ -8975,15 +9067,15 @@ struct llm_build_context {
8975
9067
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8976
9068
  cb(Vcur, "Vcur", il);
8977
9069
 
8978
- Qcur = ggml_rope_custom(
8979
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9070
+ Qcur = ggml_rope_ext(
9071
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8980
9072
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8981
9073
  ext_factor, attn_factor, beta_fast, beta_slow
8982
9074
  );
8983
9075
  cb(Qcur, "Qcur", il);
8984
9076
 
8985
- Kcur = ggml_rope_custom(
8986
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9077
+ Kcur = ggml_rope_ext(
9078
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8987
9079
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8988
9080
  ext_factor, attn_factor, beta_fast, beta_slow
8989
9081
  );
@@ -9089,15 +9181,15 @@ struct llm_build_context {
9089
9181
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9090
9182
  cb(Vcur, "Vcur", il);
9091
9183
 
9092
- Qcur = ggml_rope_custom(
9093
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9184
+ Qcur = ggml_rope_ext(
9185
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9094
9186
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9095
9187
  ext_factor, attn_factor, beta_fast, beta_slow
9096
9188
  );
9097
9189
  cb(Qcur, "Qcur", il);
9098
9190
 
9099
- Kcur = ggml_rope_custom(
9100
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9191
+ Kcur = ggml_rope_ext(
9192
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9101
9193
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9102
9194
  ext_factor, attn_factor, beta_fast, beta_slow
9103
9195
  );
@@ -9133,6 +9225,7 @@ struct llm_build_context {
9133
9225
  model.layers[il].ffn_down_exps,
9134
9226
  n_expert, n_expert_used,
9135
9227
  LLM_FFN_SILU, false,
9228
+ false, 0.0,
9136
9229
  cb, il);
9137
9230
  cb(cur, "ffn_moe_out", il);
9138
9231
 
@@ -9241,8 +9334,8 @@ struct llm_build_context {
9241
9334
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9242
9335
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9243
9336
 
9244
- Qcur = ggml_rope_custom(
9245
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9337
+ Qcur = ggml_rope_ext(
9338
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9246
9339
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9247
9340
  );
9248
9341
  cb(Qcur, "Qcur", il);
@@ -9252,8 +9345,8 @@ struct llm_build_context {
9252
9345
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9253
9346
  cb(Qcur, "Qcur", il);
9254
9347
 
9255
- Kcur = ggml_rope_custom(
9256
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9348
+ Kcur = ggml_rope_ext(
9349
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9257
9350
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9258
9351
  );
9259
9352
  cb(Kcur, "Kcur", il);
@@ -9329,6 +9422,9 @@ struct llm_build_context {
9329
9422
 
9330
9423
  // self-attention
9331
9424
  {
9425
+ // rope freq factors for 128k context
9426
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
9427
+
9332
9428
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9333
9429
  model.layers[il].attn_norm,
9334
9430
  NULL,
@@ -9360,8 +9456,8 @@ struct llm_build_context {
9360
9456
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9361
9457
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9362
9458
 
9363
- Qcur = ggml_rope_custom(
9364
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9459
+ Qcur = ggml_rope_ext(
9460
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9365
9461
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9366
9462
  );
9367
9463
  cb(Qcur, "Qcur", il);
@@ -9369,8 +9465,8 @@ struct llm_build_context {
9369
9465
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9370
9466
  cb(Qcur, "Qcur", il);
9371
9467
 
9372
- Kcur = ggml_rope_custom(
9373
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9468
+ Kcur = ggml_rope_ext(
9469
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9374
9470
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9375
9471
  );
9376
9472
  cb(Kcur, "Kcur", il);
@@ -9476,14 +9572,14 @@ struct llm_build_context {
9476
9572
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9477
9573
  cb(Vcur, "Vcur", il);
9478
9574
 
9479
- Qcur = ggml_rope_custom(
9480
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
9575
+ Qcur = ggml_rope_ext(
9576
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9481
9577
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9482
9578
  ext_factor, attn_factor, beta_fast, beta_slow);
9483
9579
  cb(Qcur, "Qcur", il);
9484
9580
 
9485
- Kcur = ggml_rope_custom(
9486
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
9581
+ Kcur = ggml_rope_ext(
9582
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9487
9583
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9488
9584
  ext_factor, attn_factor, beta_fast, beta_slow);
9489
9585
  cb(Kcur, "Kcur", il);
@@ -9684,15 +9780,15 @@ struct llm_build_context {
9684
9780
  cb(tmpk, "tmpk", il);
9685
9781
  cb(Vcur, "Vcur", il);
9686
9782
 
9687
- struct ggml_tensor * Qcur = ggml_rope_custom(
9688
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
9783
+ struct ggml_tensor * Qcur = ggml_rope_ext(
9784
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9689
9785
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9690
9786
  ext_factor, attn_factor, beta_fast, beta_slow
9691
9787
  );
9692
9788
  cb(Qcur, "Qcur", il);
9693
9789
 
9694
- struct ggml_tensor * Kcur = ggml_rope_custom(
9695
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
9790
+ struct ggml_tensor * Kcur = ggml_rope_ext(
9791
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9696
9792
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9697
9793
  ext_factor, attn_factor, beta_fast, beta_slow
9698
9794
  );
@@ -9800,15 +9896,15 @@ struct llm_build_context {
9800
9896
  // cb(Vcur, "Vcur", il);
9801
9897
  // }
9802
9898
 
9803
- Qcur = ggml_rope_custom(
9804
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9899
+ Qcur = ggml_rope_ext(
9900
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9805
9901
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9806
9902
  ext_factor, attn_factor, beta_fast, beta_slow
9807
9903
  );
9808
9904
  cb(Qcur, "Qcur", il);
9809
9905
 
9810
- Kcur = ggml_rope_custom(
9811
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9906
+ Kcur = ggml_rope_ext(
9907
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9812
9908
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9813
9909
  ext_factor, attn_factor, beta_fast, beta_slow
9814
9910
  );
@@ -9917,15 +10013,15 @@ struct llm_build_context {
9917
10013
  cb(Vcur, "Vcur", il);
9918
10014
  }
9919
10015
 
9920
- Qcur = ggml_rope_custom(
9921
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10016
+ Qcur = ggml_rope_ext(
10017
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9922
10018
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9923
10019
  ext_factor, attn_factor, beta_fast, beta_slow
9924
10020
  );
9925
10021
  cb(Qcur, "Qcur", il);
9926
10022
 
9927
- Kcur = ggml_rope_custom(
9928
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10023
+ Kcur = ggml_rope_ext(
10024
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9929
10025
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9930
10026
  ext_factor, attn_factor, beta_fast, beta_slow
9931
10027
  );
@@ -10047,15 +10143,15 @@ struct llm_build_context {
10047
10143
  cb(Vcur, "Vcur", il);
10048
10144
  }
10049
10145
 
10050
- Qcur = ggml_rope_custom(
10051
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10146
+ Qcur = ggml_rope_ext(
10147
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10052
10148
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10053
10149
  ext_factor, attn_factor, beta_fast, beta_slow
10054
10150
  );
10055
10151
  cb(Qcur, "Qcur", il);
10056
10152
 
10057
- Kcur = ggml_rope_custom(
10058
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10153
+ Kcur = ggml_rope_ext(
10154
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10059
10155
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10060
10156
  ext_factor, attn_factor, beta_fast, beta_slow
10061
10157
  );
@@ -10167,8 +10263,8 @@ struct llm_build_context {
10167
10263
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10168
10264
  cb(Vcur, "Vcur", il);
10169
10265
 
10170
- Qcur = ggml_rope_custom(
10171
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
10266
+ Qcur = ggml_rope_ext(
10267
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10172
10268
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10173
10269
  ext_factor, attn_factor, beta_fast, beta_slow);
10174
10270
  cb(Qcur, "Qcur", il);
@@ -10176,8 +10272,8 @@ struct llm_build_context {
10176
10272
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
10177
10273
  cb(Qcur, "Qcur_scaled", il);
10178
10274
 
10179
- Kcur = ggml_rope_custom(
10180
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
10275
+ Kcur = ggml_rope_ext(
10276
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10181
10277
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10182
10278
  ext_factor, attn_factor, beta_fast, beta_slow);
10183
10279
  cb(Kcur, "Kcur", il);
@@ -10287,15 +10383,15 @@ struct llm_build_context {
10287
10383
  cb(Vcur, "Vcur", il);
10288
10384
  }
10289
10385
 
10290
- Qcur = ggml_rope_custom(
10291
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10386
+ Qcur = ggml_rope_ext(
10387
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10292
10388
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10293
10389
  ext_factor, attn_factor, beta_fast, beta_slow
10294
10390
  );
10295
10391
  cb(Qcur, "Qcur", il);
10296
10392
 
10297
- Kcur = ggml_rope_custom(
10298
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10393
+ Kcur = ggml_rope_ext(
10394
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10299
10395
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10300
10396
  ext_factor, attn_factor, beta_fast, beta_slow
10301
10397
  );
@@ -10577,15 +10673,15 @@ struct llm_build_context {
10577
10673
  cb(Kcur, "Kcur", il);
10578
10674
  }
10579
10675
 
10580
- Qcur = ggml_rope_custom(
10581
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10676
+ Qcur = ggml_rope_ext(
10677
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10582
10678
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10583
10679
  ext_factor, attn_factor, beta_fast, beta_slow
10584
10680
  );
10585
10681
  cb(Qcur, "Qcur", il);
10586
10682
 
10587
- Kcur = ggml_rope_custom(
10588
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10683
+ Kcur = ggml_rope_ext(
10684
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10589
10685
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10590
10686
  ext_factor, attn_factor, beta_fast, beta_slow
10591
10687
  );
@@ -10680,8 +10776,269 @@ struct llm_build_context {
10680
10776
 
10681
10777
  // norm
10682
10778
  cur = llm_build_norm(ctx0, inpL, hparams,
10683
- NULL, NULL,
10684
- LLM_NORM, cb, il);
10779
+ NULL, NULL,
10780
+ LLM_NORM, cb, il);
10781
+ cb(cur, "attn_norm", il);
10782
+
10783
+ // self-attention
10784
+ {
10785
+ // compute Q and K and RoPE them
10786
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10787
+ cb(Qcur, "Qcur", il);
10788
+ if (hparams.f_clamp_kqv > 0.0f) {
10789
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10790
+ cb(Qcur, "Qcur", il);
10791
+ }
10792
+
10793
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10794
+ cb(Kcur, "Kcur", il);
10795
+ if (hparams.f_clamp_kqv > 0.0f) {
10796
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10797
+ cb(Kcur, "Kcur", il);
10798
+ }
10799
+
10800
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10801
+ cb(Vcur, "Vcur", il);
10802
+ if (hparams.f_clamp_kqv > 0.0f) {
10803
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10804
+ cb(Vcur, "Vcur", il);
10805
+ }
10806
+
10807
+ Qcur = ggml_rope_ext(
10808
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10809
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10810
+ ext_factor, attn_factor, beta_fast, beta_slow
10811
+ );
10812
+ cb(Qcur, "Qcur", il);
10813
+
10814
+ Kcur = ggml_rope_ext(
10815
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10816
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10817
+ ext_factor, attn_factor, beta_fast, beta_slow
10818
+ );
10819
+ cb(Kcur, "Kcur", il);
10820
+
10821
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10822
+ model.layers[il].wo, nullptr,
10823
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10824
+ }
10825
+
10826
+ if (il == n_layer - 1) {
10827
+ // skip computing output for unused tokens
10828
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10829
+ n_tokens = n_outputs;
10830
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10831
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10832
+ }
10833
+
10834
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10835
+ cb(ffn_inp, "ffn_inp", il);
10836
+
10837
+ // feed-forward network
10838
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10839
+ NULL, NULL,
10840
+ LLM_NORM, cb, il);
10841
+ cb(cur, "ffn_norm", il);
10842
+
10843
+ cur = llm_build_ffn(ctx0, cur,
10844
+ model.layers[il].ffn_up, NULL,
10845
+ model.layers[il].ffn_gate, NULL,
10846
+ model.layers[il].ffn_down, NULL,
10847
+ NULL,
10848
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10849
+ cb(cur, "ffn_out", il);
10850
+
10851
+ cur = ggml_add(ctx0, cur, ffn_inp);
10852
+ cb(cur, "ffn_out", il);
10853
+
10854
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10855
+ if (layer_dir != nullptr) {
10856
+ cur = ggml_add(ctx0, cur, layer_dir);
10857
+ }
10858
+ cb(cur, "l_out", il);
10859
+
10860
+ // input for next layer
10861
+ inpL = cur;
10862
+ }
10863
+
10864
+ cur = inpL;
10865
+
10866
+ cur = llm_build_norm(ctx0, cur, hparams,
10867
+ NULL, NULL,
10868
+ LLM_NORM, cb, -1);
10869
+ cb(cur, "result_norm", -1);
10870
+
10871
+ // lm_head
10872
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10873
+ cb(cur, "result_output", -1);
10874
+
10875
+ ggml_build_forward_expand(gf, cur);
10876
+
10877
+ return gf;
10878
+ }
10879
+
10880
+ struct ggml_cgraph * build_gptneox() {
10881
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10882
+
10883
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10884
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
10885
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10886
+
10887
+ struct ggml_tensor * cur;
10888
+ struct ggml_tensor * inpL;
10889
+
10890
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10891
+
10892
+ // inp_pos - contains the positions
10893
+ struct ggml_tensor * inp_pos = build_inp_pos();
10894
+
10895
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10896
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10897
+
10898
+ for (int il = 0; il < n_layer; ++il) {
10899
+ cur = llm_build_norm(ctx0, inpL, hparams,
10900
+ model.layers[il].attn_norm,
10901
+ model.layers[il].attn_norm_b,
10902
+ LLM_NORM, cb, il);
10903
+ cb(cur, "attn_norm", il);
10904
+
10905
+ // self-attention
10906
+ {
10907
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10908
+ cb(cur, "wqkv", il);
10909
+
10910
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10911
+ cb(cur, "bqkv", il);
10912
+
10913
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10914
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
10915
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10916
+
10917
+ cb(Qcur, "Qcur", il);
10918
+ cb(Kcur, "Kcur", il);
10919
+ cb(Vcur, "Vcur", il);
10920
+
10921
+ Qcur = ggml_rope_ext(
10922
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10923
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10924
+ ext_factor, attn_factor, beta_fast, beta_slow
10925
+ );
10926
+ cb(Qcur, "Qcur", il);
10927
+
10928
+ Kcur = ggml_rope_ext(
10929
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10930
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10931
+ ext_factor, attn_factor, beta_fast, beta_slow
10932
+ );
10933
+ cb(Kcur, "Kcur", il);
10934
+
10935
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10936
+ model.layers[il].wo, model.layers[il].bo,
10937
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10938
+ }
10939
+
10940
+ if (il == n_layer - 1) {
10941
+ // skip computing output for unused tokens
10942
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10943
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10944
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10945
+ }
10946
+
10947
+ // ffn
10948
+ if (hparams.use_par_res) {
10949
+ // attention and ffn are computed in parallel
10950
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
10951
+
10952
+ struct ggml_tensor * attn_out = cur;
10953
+
10954
+ cur = llm_build_norm(ctx0, inpL, hparams,
10955
+ model.layers[il].ffn_norm,
10956
+ model.layers[il].ffn_norm_b,
10957
+ LLM_NORM, cb, il);
10958
+ cb(cur, "ffn_norm", il);
10959
+
10960
+ cur = llm_build_ffn(ctx0, cur,
10961
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10962
+ NULL, NULL,
10963
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10964
+ NULL,
10965
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10966
+ cb(cur, "ffn_out", il);
10967
+
10968
+ cur = ggml_add(ctx0, cur, inpL);
10969
+ cb(cur, "ffn_out", il);
10970
+
10971
+ inpL = ggml_add(ctx0, cur, attn_out);
10972
+ cb(inpL, "l_out", il);
10973
+ } else {
10974
+ // attention and ffn are computed sequentially
10975
+ // x = x + attn(ln1(x))
10976
+ // x = x + ffn(ln2(x))
10977
+
10978
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
10979
+ cb(ffn_inp, "ffn_inp", il);
10980
+
10981
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10982
+ model.layers[il].ffn_norm,
10983
+ model.layers[il].ffn_norm_b,
10984
+ LLM_NORM, cb, il);
10985
+ cb(cur, "ffn_norm", il);
10986
+
10987
+ cur = llm_build_ffn(ctx0, cur,
10988
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10989
+ NULL, NULL,
10990
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10991
+ NULL,
10992
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10993
+ cb(cur, "ffn_out", il);
10994
+
10995
+ inpL = ggml_add(ctx0, cur, ffn_inp);
10996
+ cb(inpL, "l_out", il);
10997
+ }
10998
+ }
10999
+
11000
+ cur = llm_build_norm(ctx0, inpL, hparams,
11001
+ model.output_norm,
11002
+ model.output_norm_b,
11003
+ LLM_NORM, cb, -1);
11004
+ cb(cur, "result_norm", -1);
11005
+
11006
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11007
+ cb(cur, "result_output", -1);
11008
+
11009
+ ggml_build_forward_expand(gf, cur);
11010
+
11011
+ return gf;
11012
+ }
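build_gptneox picks between the two residual layouts described in its comments via hparams.use_par_res. Written out per layer (x is the layer input, LN1/LN2 the attention and FFN norms; this only restates the graph built above):

    \text{parallel:}\quad x_{out} = x + \mathrm{Attn}(\mathrm{LN}_1(x)) + \mathrm{FFN}(\mathrm{LN}_2(x))
    \text{sequential:}\quad y = x + \mathrm{Attn}(\mathrm{LN}_1(x)), \qquad x_{out} = y + \mathrm{FFN}(\mathrm{LN}_2(y))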
11013
+
11014
+ struct ggml_cgraph * build_arctic() {
11015
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11016
+
11017
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11018
+ int32_t n_tokens = this->n_tokens;
11019
+
11020
+ const int64_t n_embd_head = hparams.n_embd_head_v;
11021
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
11022
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
11023
+
11024
+ struct ggml_tensor * cur;
11025
+ struct ggml_tensor * inpL;
11026
+
11027
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
11028
+
11029
+ // inp_pos - contains the positions
11030
+ struct ggml_tensor * inp_pos = build_inp_pos();
11031
+
11032
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11033
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
11034
+
11035
+ for (int il = 0; il < n_layer; ++il) {
11036
+ struct ggml_tensor * inpSA = inpL;
11037
+
11038
+ // norm
11039
+ cur = llm_build_norm(ctx0, inpL, hparams,
11040
+ model.layers[il].attn_norm, NULL,
11041
+ LLM_NORM_RMS, cb, il);
10685
11042
  cb(cur, "attn_norm", il);
10686
11043
 
10687
11044
  // self-attention
@@ -10689,41 +11046,29 @@ struct llm_build_context {
10689
11046
  // compute Q and K and RoPE them
10690
11047
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10691
11048
  cb(Qcur, "Qcur", il);
10692
- if (hparams.f_clamp_kqv > 0.0f) {
10693
- Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10694
- cb(Qcur, "Qcur", il);
10695
- }
10696
11049
 
10697
11050
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10698
11051
  cb(Kcur, "Kcur", il);
10699
- if (hparams.f_clamp_kqv > 0.0f) {
10700
- Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10701
- cb(Kcur, "Kcur", il);
10702
- }
10703
11052
 
10704
11053
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10705
11054
  cb(Vcur, "Vcur", il);
10706
- if (hparams.f_clamp_kqv > 0.0f) {
10707
- Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10708
- cb(Vcur, "Vcur", il);
10709
- }
10710
11055
 
10711
- Qcur = ggml_rope_custom(
10712
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
11056
+ Qcur = ggml_rope_ext(
11057
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10713
11058
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10714
11059
  ext_factor, attn_factor, beta_fast, beta_slow
10715
11060
  );
10716
11061
  cb(Qcur, "Qcur", il);
10717
11062
 
10718
- Kcur = ggml_rope_custom(
10719
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
11063
+ Kcur = ggml_rope_ext(
11064
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10720
11065
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10721
11066
  ext_factor, attn_factor, beta_fast, beta_slow
10722
11067
  );
10723
11068
  cb(Kcur, "Kcur", il);
10724
11069
 
10725
11070
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10726
- model.layers[il].wo, nullptr,
11071
+ model.layers[il].wo, NULL,
10727
11072
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10728
11073
  }
10729
11074
 
@@ -10740,8 +11085,8 @@ struct llm_build_context {
10740
11085
 
10741
11086
  // feed-forward network
10742
11087
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
10743
- NULL, NULL,
10744
- LLM_NORM, cb, il);
11088
+ model.layers[il].ffn_norm, NULL,
11089
+ LLM_NORM_RMS, cb, il);
10745
11090
  cb(cur, "ffn_norm", il);
10746
11091
 
10747
11092
  cur = llm_build_ffn(ctx0, cur,
@@ -10752,7 +11097,27 @@ struct llm_build_context {
10752
11097
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10753
11098
  cb(cur, "ffn_out", il);
10754
11099
 
10755
- cur = ggml_add(ctx0, cur, ffn_inp);
11100
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
11101
+ cb(ffn_out, "ffn_out", il);
11102
+
11103
+ // MoE
11104
+ cur = llm_build_norm(ctx0, inpSA, hparams,
11105
+ model.layers[il].ffn_norm_exps, NULL,
11106
+ LLM_NORM_RMS, cb, il);
11107
+ cb(cur, "ffn_norm_exps", il);
11108
+
11109
+ cur = llm_build_moe_ffn(ctx0, cur,
11110
+ model.layers[il].ffn_gate_inp,
11111
+ model.layers[il].ffn_up_exps,
11112
+ model.layers[il].ffn_gate_exps,
11113
+ model.layers[il].ffn_down_exps,
11114
+ n_expert, n_expert_used,
11115
+ LLM_FFN_SILU, true,
11116
+ false, 0.0,
11117
+ cb, il);
11118
+ cb(cur, "ffn_moe_out", il);
11119
+
11120
+ cur = ggml_add(ctx0, cur, ffn_out);
10756
11121
  cb(cur, "ffn_out", il);
10757
11122
 
10758
11123
  ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
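For LLM_ARCH_ARCTIC this hunk splits the FFN block in two: the dense SiLU FFN keeps feeding off the attention residual as before, while a routed MoE branch is added that reads the pre-attention activations through their own RMS norm (ffn_norm_exps) and is summed in afterwards. As an equation (a restatement of the graph above, with x the layer input and a the attention output):

    x_{out} = (x + a) + \mathrm{FFN}\bigl(\mathrm{RMSNorm}_{ffn}(x + a)\bigr) + \mathrm{MoE}\bigl(\mathrm{RMSNorm}_{exps}(x)\bigr)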
@@ -10768,8 +11133,240 @@ struct llm_build_context {
10768
11133
  cur = inpL;
10769
11134
 
10770
11135
  cur = llm_build_norm(ctx0, cur, hparams,
10771
- NULL, NULL,
10772
- LLM_NORM, cb, -1);
11136
+ model.output_norm, NULL,
11137
+ LLM_NORM_RMS, cb, -1);
11138
+ cb(cur, "result_norm", -1);
11139
+
11140
+ // lm_head
11141
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11142
+ cb(cur, "result_output", -1);
11143
+
11144
+ ggml_build_forward_expand(gf, cur);
11145
+
11146
+ return gf;
11147
+ }
11148
+
11149
+ struct ggml_cgraph * build_deepseek2() {
11150
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11151
+
11152
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11153
+ int32_t n_tokens = this->n_tokens;
11154
+
11155
+ bool is_lite = (hparams.n_layer == 27);
11156
+
11157
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
11158
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
11159
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
11160
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
11161
+ const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
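Spelled out, the three constants above are (with s = 1/freq_scale, the YaRN context-extension factor, and d_k = n_embd_head_k):

    \mathrm{mscale} = \mathrm{attn\_factor}\,(1 + \mathrm{rope\_yarn\_log\_mul}\cdot\ln s), \qquad
    \mathrm{kq\_scale} = \frac{\mathrm{mscale}^2}{\sqrt{d_k}}, \qquad
    \mathrm{attn\_factor\_scaled} = \frac{1}{1 + 0.1\,\ln s}

so the usual 1/\sqrt{d_k} attention scale is multiplied by mscale^2, while the attention factor passed to ggml_rope_ext is replaced by the reciprocal of the standard YaRN magnitude term 1 + 0.1\,\ln s, which is the pre-scaling the comment refers to.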
11162
+
11163
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
11164
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
11165
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
11166
+
11167
+ struct ggml_tensor * cur;
11168
+ struct ggml_tensor * inpL;
11169
+
11170
+ // {n_embd, n_tokens}
11171
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
11172
+
11173
+ // inp_pos - contains the positions
11174
+ struct ggml_tensor * inp_pos = build_inp_pos();
11175
+
11176
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11177
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
11178
+
11179
+ for (int il = 0; il < n_layer; ++il) {
11180
+ struct ggml_tensor * inpSA = inpL;
11181
+
11182
+ // norm
11183
+ cur = llm_build_norm(ctx0, inpL, hparams,
11184
+ model.layers[il].attn_norm, NULL,
11185
+ LLM_NORM_RMS, cb, il);
11186
+ cb(cur, "attn_norm", il);
11187
+
11188
+ // self_attention
11189
+ {
11190
+ struct ggml_tensor * q = NULL;
11191
+ if (!is_lite) {
11192
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
11193
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
11194
+ cb(q, "q", il);
11195
+
11196
+ q = llm_build_norm(ctx0, q, hparams,
11197
+ model.layers[il].attn_q_a_norm, NULL,
11198
+ LLM_NORM_RMS, cb, il);
11199
+ cb(q, "q", il);
11200
+
11201
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
11202
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
11203
+ cb(q, "q", il);
11204
+ } else {
11205
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11206
+ cb(q, "q", il);
11207
+ }
11208
+
11209
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11210
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
11211
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11212
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11213
+ 0);
11214
+ cb(q_nope, "q_nope", il);
11215
+
11216
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
11217
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
11218
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11219
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11220
+ ggml_row_size(q->type, n_embd_head_qk_nope));
11221
+ cb(q_pe, "q_pe", il);
11222
+
11223
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
11224
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11225
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
11226
+
11227
+ // split into {kv_lora_rank, n_tokens}
11228
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
11229
+ kv_pe_compresseed->nb[1],
11230
+ 0);
11231
+ cb(kv_compressed, "kv_compressed", il);
11232
+
11233
+ // and {n_embd_head_qk_rope, n_tokens}
11234
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
11235
+ kv_pe_compresseed->nb[1],
11236
+ kv_pe_compresseed->nb[1],
11237
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
11238
+ cb(k_pe, "k_pe", il);
11239
+
11240
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
11241
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
11242
+ model.layers[il].attn_kv_a_norm, NULL,
11243
+ LLM_NORM_RMS, cb, il);
11244
+ cb(kv_compressed, "kv_compressed", il);
11245
+
11246
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
11247
+ struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
11248
+ cb(kv, "kv", il);
11249
+
11250
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11251
+ struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
11252
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
11253
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11254
+ 0);
11255
+ cb(k_nope, "k_nope", il);
11256
+
11257
+ // and {n_head * n_embd_head_v, n_tokens}
11258
+ struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
11259
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11260
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
11261
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
11262
+ cb(v_states, "v_states", il);
11263
+
11264
+ v_states = ggml_cont(ctx0, v_states);
11265
+ cb(v_states, "v_states", il);
11266
+
11267
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
11268
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
11269
+ 0);
11270
+ cb(v_states, "v_states", il);
11271
+
11272
+ q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11273
+ q_pe = ggml_rope_ext(
11274
+ ctx0, q_pe, inp_pos, nullptr,
11275
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11276
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11277
+ );
11278
+ cb(q_pe, "q_pe", il);
11279
+
11280
+ // shared RoPE key
11281
+ k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11282
+ k_pe = ggml_rope_ext(
11283
+ ctx0, k_pe, inp_pos, nullptr,
11284
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11285
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11286
+ );
11287
+ cb(k_pe, "k_pe", il);
11288
+
11289
+ struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
11290
+ cb(q_states, "q_states", il);
11291
+
11292
+ struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
11293
+ cb(k_states, "k_states", il);
11294
+
11295
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11296
+ model.layers[il].wo, NULL,
11297
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
11298
+ }
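The block above is DeepSeek-V2-style multi-head latent attention: queries go through the low-rank pair wq_a/wq_b (with an RMS norm on the q_lora_rank bottleneck, skipped in the 27-layer lite variant), and keys/values are rebuilt from a kv_lora_rank-compressed vector plus a small rotary part shared across heads (hence the ggml_repeat on k_pe). Per token x, roughly (a restatement of the tensor ops above, not new behaviour):

    q = W_{q,b}\,\mathrm{RMSNorm}(W_{q,a}\,x) = [\,q_{nope}\,;\,q_{pe}\,]
    [\,c_{kv}\,;\,k_{pe}\,] = W_{kv,a}\,x
    [\,k_{nope}\,;\,v\,] = W_{kv,b}\,\mathrm{RMSNorm}(c_{kv})
    k = [\,k_{nope}\,;\,\mathrm{RoPE}(k_{pe})\,], \qquad q' = [\,q_{nope}\,;\,\mathrm{RoPE}(q_{pe})\,]

with attention then computed as usual between q', k and v, scaled by kq_scale.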
11299
+
11300
+ if (il == n_layer - 1) {
11301
+ // skip computing output for unused tokens
11302
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11303
+ n_tokens = n_outputs;
11304
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11305
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11306
+ }
11307
+
11308
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
11309
+ cb(ffn_inp, "ffn_inp", il);
11310
+
11311
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
11312
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11313
+ model.layers[il].ffn_norm, NULL,
11314
+ LLM_NORM_RMS, cb, il);
11315
+ cb(cur, "ffn_norm", il);
11316
+
11317
+ cur = llm_build_ffn(ctx0, cur,
11318
+ model.layers[il].ffn_up, NULL,
11319
+ model.layers[il].ffn_gate, NULL,
11320
+ model.layers[il].ffn_down, NULL,
11321
+ NULL,
11322
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11323
+ cb(cur, "ffn_out", il);
11324
+ } else {
11325
+ // MoE branch
11326
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11327
+ model.layers[il].ffn_norm, NULL,
11328
+ LLM_NORM_RMS, cb, il);
11329
+ cb(cur, "ffn_norm", il);
11330
+
11331
+ ggml_tensor * moe_out =
11332
+ llm_build_moe_ffn(ctx0, cur,
11333
+ model.layers[il].ffn_gate_inp,
11334
+ model.layers[il].ffn_up_exps,
11335
+ model.layers[il].ffn_gate_exps,
11336
+ model.layers[il].ffn_down_exps,
11337
+ n_expert, n_expert_used,
11338
+ LLM_FFN_SILU, false,
11339
+ true, hparams.expert_weights_scale,
11340
+ cb, il);
11341
+ cb(moe_out, "ffn_moe_out", il);
11342
+
11343
+ // FFN shared expert
11344
+ {
11345
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
11346
+ model.layers[il].ffn_up_shexp, NULL,
11347
+ model.layers[il].ffn_gate_shexp, NULL,
11348
+ model.layers[il].ffn_down_shexp, NULL,
11349
+ NULL,
11350
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11351
+ cb(ffn_shexp, "ffn_shexp", il);
11352
+
11353
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
11354
+ cb(cur, "ffn_out", il);
11355
+ }
11356
+ }
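Past the first n_layer_dense_lead dense layers, each FFN block adds an always-on shared expert to the routed mixture: llm_build_moe_ffn produces the routed sum (its false/true flag pair together with hparams.expert_weights_scale presumably means the router weights are not renormalized but are multiplied by that scale; treat that reading of the flags as an assumption), and a plain SiLU-gated FFN over the same normalized input is added on top:

    y = \mathrm{ffn\_inp} + s\sum_{i\in\mathrm{top\text{-}}k} g_i\,E_i(\hat{x}) + E_{shared}(\hat{x}), \qquad \hat{x} = \mathrm{RMSNorm}(\mathrm{ffn\_inp})

with s = expert_weights_scale and g_i the per-token router weights.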
11357
+
11358
+ cur = ggml_add(ctx0, cur, ffn_inp);
11359
+ cb(cur, "l_out", il);
11360
+
11361
+ // input for next layer
11362
+ inpL = cur;
11363
+ }
11364
+
11365
+ cur = inpL;
11366
+
11367
+ cur = llm_build_norm(ctx0, cur, hparams,
11368
+ model.output_norm, NULL,
11369
+ LLM_NORM_RMS, cb, -1);
10773
11370
  cb(cur, "result_norm", -1);
10774
11371
 
10775
11372
  // lm_head
@@ -10780,6 +11377,7 @@ struct llm_build_context {
10780
11377
 
10781
11378
  return gf;
10782
11379
  }
11380
+
10783
11381
  };
10784
11382
 
10785
11383
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -10896,10 +11494,6 @@ static struct ggml_cgraph * llama_build_graph(
10896
11494
  {
10897
11495
  result = llm.build_starcoder();
10898
11496
  } break;
10899
- case LLM_ARCH_PERSIMMON:
10900
- {
10901
- result = llm.build_persimmon();
10902
- } break;
10903
11497
  case LLM_ARCH_REFACT:
10904
11498
  {
10905
11499
  result = llm.build_refact();
@@ -10994,6 +11588,18 @@ static struct ggml_cgraph * llama_build_graph(
10994
11588
  {
10995
11589
  result = llm.build_olmo();
10996
11590
  } break;
11591
+ case LLM_ARCH_GPTNEOX:
11592
+ {
11593
+ result = llm.build_gptneox();
11594
+ } break;
11595
+ case LLM_ARCH_ARCTIC:
11596
+ {
11597
+ result = llm.build_arctic();
11598
+ } break;
11599
+ case LLM_ARCH_DEEPSEEK2:
11600
+ {
11601
+ result = llm.build_deepseek2();
11602
+ } break;
10997
11603
  default:
10998
11604
  GGML_ASSERT(false);
10999
11605
  }
@@ -11339,11 +11945,6 @@ static void llama_graph_compute(
11339
11945
  llama_context & lctx,
11340
11946
  ggml_cgraph * gf,
11341
11947
  int n_threads) {
11342
- #ifdef GGML_USE_MPI
11343
- const int64_t n_layer = lctx.model.hparams.n_layer;
11344
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
11345
- #endif
11346
-
11347
11948
  #ifdef GGML_USE_METAL
11348
11949
  if (ggml_backend_is_metal(lctx.backend_metal)) {
11349
11950
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11358,10 +11959,6 @@ static void llama_graph_compute(
11358
11959
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
11359
11960
 
11360
11961
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
11361
-
11362
- #ifdef GGML_USE_MPI
11363
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
11364
- #endif
11365
11962
  }
11366
11963
 
11367
11964
  // decode a batch of tokens by evaluating the transformer
@@ -11399,12 +11996,6 @@ static int llama_decode_internal(
11399
11996
  }
11400
11997
  lctx.n_queued_tokens += n_tokens_all;
11401
11998
 
11402
- #ifdef GGML_USE_MPI
11403
- // TODO: needs fix after #3228
11404
- GGML_ASSERT(false && "not implemented");
11405
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
11406
- #endif
11407
-
11408
11999
  auto & kv_self = lctx.kv_self;
11409
12000
 
11410
12001
  const int64_t n_embd = hparams.n_embd;
@@ -12298,6 +12889,7 @@ struct llm_tokenizer_bpe {
12298
12889
  });
12299
12890
  break;
12300
12891
  case LLAMA_VOCAB_PRE_TYPE_DBRX:
12892
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12301
12893
  word_collection = unicode_regex_split(text, {
12302
12894
  // same as llama3
12303
12895
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12354,6 +12946,7 @@ struct llm_tokenizer_bpe {
12354
12946
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12355
12947
  });
12356
12948
  break;
12949
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
12357
12950
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12358
12951
  word_collection = unicode_regex_split(text, {
12359
12952
  // original regex from tokenizer.json
@@ -12519,7 +13112,7 @@ struct llm_tokenizer_wpm {
12519
13112
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
12520
13113
 
12521
13114
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12522
- auto * token_map = &vocab.token_to_id;
13115
+ const auto & token_map = vocab.token_to_id;
12523
13116
 
12524
13117
  // normalize and split by whitespace
12525
13118
  std::vector<std::string> words = preprocess(text);
@@ -12534,108 +13127,89 @@ struct llm_tokenizer_wpm {
12534
13127
  }
12535
13128
 
12536
13129
  // prepend phantom space
12537
- std::string word1 = "\xe2\x96\x81" + word;
12538
- int n = word1.size();
13130
+ const std::string word1 = "\xe2\x96\x81" + word;
13131
+ const int n = word1.size();
12539
13132
 
12540
- // we're at the start of a new word
12541
- int i = 0;
12542
- bool match_any = false;
13133
+ const size_t current_tokens = output.size();
12543
13134
 
13135
+ // we're at the start of a new word
12544
13136
  // move through character position in word
12545
- while (i < n) {
13137
+ for (int i = 0; i < n; ++i) {
12546
13138
  // loop through possible match length
12547
13139
  bool match = false;
12548
13140
  for (int j = n; j > i; j--) {
12549
- auto it = token_map->find(word1.substr(i, j - i));
12550
- if (it != token_map->end()) {
13141
+ auto it = token_map.find(word1.substr(i, j - i));
13142
+ if (it != token_map.end()) {
12551
13143
  output.push_back(it->second);
12552
13144
  match = true;
12553
- match_any = true;
12554
- i = j;
13145
+ i = j - 1;
12555
13146
  break;
12556
13147
  }
12557
13148
  }
12558
13149
 
12559
- // must be an unknown character
12560
- if (!match) {
12561
- i++;
13150
+ if (!match) { // discard all
13151
+ output.resize(current_tokens);
13152
+ break; // and discard next tokens
12562
13153
  }
12563
13154
  }
12564
13155
 
12565
13156
  // we didn't find any matches for this word
12566
- if (!match_any) {
13157
+ if (current_tokens == output.size()) {
12567
13158
  output.push_back(vocab.special_unk_id);
12568
13159
  }
12569
13160
  }
12570
13161
  }
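The loop above is greedy longest-match WordPiece with one behavioural change: instead of skipping a single unknown character, a failed match now rolls back every token already emitted for that word, so the whole word falls through to UNK. A self-contained sketch of the same idea, assuming a plain string-to-id map in place of llama_vocab (names here are illustrative, not from this file):

    #include <string>
    #include <unordered_map>
    #include <vector>

    // Greedy longest-match WordPiece over one word (already prefixed with "\xe2\x96\x81").
    // On any failed position the whole word collapses to unk_id, mirroring the loop above.
    static void wpm_tokenize_word(const std::unordered_map<std::string, int> & vocab_map,
                                  const std::string & word, int unk_id, std::vector<int> & out) {
        const size_t start = out.size();
        const int n = (int) word.size();
        for (int i = 0; i < n; ++i) {
            bool match = false;
            for (int j = n; j > i; --j) {                 // try the longest candidate first
                auto it = vocab_map.find(word.substr(i, j - i));
                if (it != vocab_map.end()) {
                    out.push_back(it->second);
                    i = j - 1;                            // continue right after the match
                    match = true;
                    break;
                }
            }
            if (!match) {                                  // unknown piece: discard the word
                out.resize(start);
                break;
            }
        }
        if (out.size() == start) {
            out.push_back(unk_id);                         // whole word becomes UNK
        }
    }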
12571
13162
 
12572
13163
  std::vector<std::string> preprocess(const std::string & text) {
12573
- std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
12574
-
12575
- // strip accents, strip control, uniformize whitespace,
12576
- // to lowercase, pad chinese characters, pad punctuation
12577
- std::string new_str = "";
12578
- for (uint32_t code : cpts_nfd) {
12579
- const codepoint_flags flags = unicode_cpt_flags(code);
12580
- if (flags.is_accent_mark || flags.is_control) {
13164
+ const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
13165
+ std::vector<std::string> words(1, "");
13166
+
13167
+ for (const char32_t cpt : cpts_nfd) {
13168
+ const auto flags = unicode_cpt_flags(cpt);
13169
+
13170
+ if (flags.is_whitespace) {
13171
+ if (words.back().size()) { // finish previous word if any
13172
+ words.emplace_back();
13173
+ }
12581
13174
  continue;
12582
13175
  }
12583
- code = unicode_tolower(code);
12584
- if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12585
- code = ' ';
12586
- }
12587
- std::string s = unicode_cpt_to_utf8(code);
12588
- if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12589
- new_str += " ";
12590
- new_str += s;
12591
- new_str += " ";
12592
- } else {
12593
- new_str += s;
13176
+
13177
+ assert (!flags.is_separator);
13178
+ if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
13179
+ continue;
12594
13180
  }
12595
- }
12596
13181
 
12597
- // split by whitespace
12598
- uint64_t l = 0;
12599
- uint64_t r = 0;
12600
- std::vector<std::string> words;
12601
- while (r < new_str.size()) {
12602
- // if is whitespace
12603
- if (isspace(new_str[r], std::locale::classic())) {
12604
- if (r > l) words.push_back(new_str.substr(l, (r - l)));
12605
- l = r + 1;
12606
- r = l;
13182
+ const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
13183
+ if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
13184
+ if (words.back().size()) { // finish previous word if any
13185
+ words.emplace_back();
13186
+ }
13187
+ words.back() = s; // single char word
13188
+ words.emplace_back(); // start a new word
12607
13189
  } else {
12608
- r += 1;
13190
+ words.back() += s; // append char to word
12609
13191
  }
12610
13192
  }
12611
- if (r > l) {
12612
- words.push_back(new_str.substr(l, (r - l)));
12613
- }
12614
- return words;
12615
- }
12616
13193
 
12617
- bool is_ascii_punct(uint32_t code) {
12618
- if (code > 0xFF) {
12619
- return false;
13194
+ if (!words.back().size()) {
13195
+ words.pop_back();
12620
13196
  }
12621
- auto c = char(static_cast<unsigned char>(code));
12622
- return ispunct(c, std::locale::classic());
13197
+
13198
+ return words;
12623
13199
  }
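The rewritten preprocess builds the word list in a single pass: whitespace closes the current word, NUL, U+FFFD and control characters are dropped, every kept codepoint is lower-cased, and punctuation, ASCII symbols and CJK characters each become a one-character word of their own. As an illustrative trace of that logic (not output captured from the library), an input such as "Hello, 世界!" comes out as the word list {"hello", ",", "世", "界", "!"}, which then goes through the per-word matching above.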
12624
13200
 
12625
- bool is_chinese_char(uint32_t cpt) {
12626
- if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
12627
- (cpt >= 0x3400 && cpt <= 0x4DBF) ||
13201
+ static bool is_chinese_char(uint32_t cpt) {
13202
+ return
13203
+ (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
13204
+ (cpt >= 0x03400 && cpt <= 0x04DBF) ||
12628
13205
  (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
12629
13206
  (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
12630
13207
  (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
12631
13208
  (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
12632
- (cpt >= 0xF900 && cpt <= 0xFAFF) ||
12633
- (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
12634
- (cpt >= 0x3000 && cpt <= 0x303F) ||
12635
- (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
12636
- return true; // NOLINT
12637
- }
12638
- return false;
13209
+ (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
13210
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F);
13211
+ //(cpt >= 0x3000 && cpt <= 0x303F) ||
13212
+ //(cpt >= 0xFF00 && cpt <= 0xFFEF);
12639
13213
  }
12640
13214
 
12641
13215
  const llama_vocab & vocab;
@@ -12679,9 +13253,8 @@ struct fragment_buffer_variant {
12679
13253
 
12680
13254
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
12681
13255
  // for each special token
12682
- for (const auto & st: vocab.special_tokens_cache) {
12683
- const auto & special_token = st.first;
12684
- const auto & special_id = st.second;
13256
+ for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
13257
+ const auto & special_token = vocab.id_to_token[special_id].text;
12685
13258
 
12686
13259
  // for each text fragment
12687
13260
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12690,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12690
13263
 
12691
13264
  // if a fragment is text ( not yet processed )
12692
13265
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
12693
- auto * raw_text = &(fragment.raw_text);
13266
+ auto & raw_text = fragment.raw_text;
12694
13267
 
12695
13268
  auto raw_text_base_offset = fragment.offset;
12696
13269
  auto raw_text_base_length = fragment.length;
@@ -12700,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12700
13273
  // find the first occurrence of a given special token in this fragment
12701
13274
  // passing offset argument only limit the "search area" but match coordinates
12702
13275
  // are still relative to the source full raw_text
12703
- auto match = raw_text->find(special_token, raw_text_base_offset);
13276
+ auto match = raw_text.find(special_token, raw_text_base_offset);
12704
13277
 
12705
13278
  // no occurrences found, stop processing this fragment for a given special token
12706
13279
  if (match == std::string::npos) break;
@@ -12719,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12719
13292
  // left
12720
13293
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
12721
13294
  const int64_t left_reminder_length = match - raw_text_base_offset;
12722
- buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
13295
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
12723
13296
 
12724
13297
  #ifdef PRETOKENIZERDEBUG
12725
13298
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
@@ -12735,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12735
13308
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
12736
13309
  const int64_t right_reminder_offset = match + special_token.length();
12737
13310
  const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
12738
- buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
13311
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
12739
13312
 
12740
13313
  #ifdef PRETOKENIZERDEBUG
12741
13314
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
@@ -12788,9 +13361,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12788
13361
  // tokenizer.encode('', add_special_tokens=True) returns [1]
12789
13362
  // tokenizer.encode('', add_special_tokens=False) returns []
12790
13363
 
13364
+ static const bool rtrim = true; //TODO: as param
13365
+ bool is_prev_special = false;
13366
+ bool special_token_rtrim = false;
13367
+
12791
13368
  if (add_special && vocab.special_add_bos != 0) {
12792
13369
  GGML_ASSERT(vocab.special_bos_id != -1);
12793
13370
  output.push_back(vocab.special_bos_id);
13371
+ is_prev_special = true;
12794
13372
  }
12795
13373
 
12796
13374
  for (const auto & fragment : fragment_buffer) {
@@ -12802,9 +13380,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12802
13380
  // and passing 'add space prefix' as bool argument
12803
13381
  //
12804
13382
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12805
- if (&fragment == &fragment_buffer.front()) {
12806
- if (vocab.add_space_prefix) {
12807
- raw_text = " " + raw_text; // prefix with space if the first token is not special
13383
+
13384
+ if (special_token_rtrim) {
13385
+ size_t num_whitespaces = 0;
13386
+ while (isspace(raw_text[num_whitespaces])) {
13387
+ num_whitespaces++;
13388
+ }
13389
+ if (num_whitespaces == raw_text.size()) {
13390
+ continue; // skip if all whitespaces
13391
+ }
13392
+ raw_text = raw_text.substr(num_whitespaces);
13393
+ }
13394
+
13395
+ if (vocab.add_space_prefix) {
13396
+ if (!output.size() || is_prev_special) { // prefix with space if first token
13397
+ raw_text = " " + raw_text;
12808
13398
  }
12809
13399
  }
12810
13400
 
@@ -12816,6 +13406,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12816
13406
  tokenizer.tokenize(raw_text, output);
12817
13407
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
12818
13408
  output.push_back(fragment.token);
13409
+ is_prev_special = true;
13410
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
13411
+ special_token_rtrim = rtrim
13412
+ && fragment.token != vocab.special_bos_id
13413
+ && fragment.token != vocab.special_unk_id
13414
+ && fragment.token != vocab.special_eos_id;
12819
13415
  }
12820
13416
  }
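Net effect of the new rtrim handling: after any special-token fragment other than BOS, UNK or EOS, leading whitespace of the following text fragment is stripped (and a fragment that is all whitespace is skipped entirely), while the add_space_prefix space is now inserted whenever the previous fragment was special rather than only at the very start of the input. As an illustrative example (assuming a phi-3-style prompt, not a case taken from this file), tokenizing "<|user|>\n Hello" with special-token parsing enabled drops the "\n " after <|user|>, leaving "Hello" (plus the optional single space prefix) for the SPM tokenizer.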
12821
13417
 
@@ -13816,7 +14412,7 @@ void llama_sample_repetition_penalties(
13816
14412
 
13817
14413
  void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
13818
14414
  GGML_ASSERT(ctx);
13819
- const int64_t t_start_sample_us = ggml_time_us();
14415
+ int64_t t_start_sample_us = ggml_time_us();
13820
14416
 
13821
14417
  bool allow_eog = false;
13822
14418
  for (const auto & stack : grammar->stacks) {
@@ -13828,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
13828
14424
 
13829
14425
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
13830
14426
  candidates_decoded.reserve(candidates->size);
13831
- std::vector<llama_grammar_candidate> candidates_grammar;
14427
+
14428
+ std::vector<llama_grammar_candidate> candidates_grammar;
13832
14429
  candidates_grammar.reserve(candidates->size);
13833
14430
 
13834
14431
  for (size_t i = 0; i < candidates->size; ++i) {
13835
- const llama_token id = candidates->data[i].id;
13836
- const std::string piece = llama_token_to_piece(ctx, id, false);
14432
+ const llama_token id = candidates->data[i].id;
14433
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
13837
14434
 
13838
14435
  if (llama_token_is_eog(&ctx->model, id)) {
13839
14436
  if (!allow_eog) {
@@ -14033,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14033
14630
  GGML_ASSERT(false);
14034
14631
  }
14035
14632
 
14036
- const std::string piece = llama_token_to_piece(ctx, token, false);
14633
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
14037
14634
 
14038
14635
  // Note terminating 0 in decoded string
14039
14636
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14518,8 +15115,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14518
15115
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
14519
15116
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
14520
15117
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
14521
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
14522
- (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
14523
15118
  if (qs.model.type == MODEL_70B) {
14524
15119
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
14525
15120
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15533,10 +16128,6 @@ void llama_backend_init(void) {
15533
16128
  struct ggml_context * ctx = ggml_init(params);
15534
16129
  ggml_free(ctx);
15535
16130
  }
15536
-
15537
- #ifdef GGML_USE_MPI
15538
- ggml_mpi_backend_init();
15539
- #endif
15540
16131
  }
15541
16132
 
15542
16133
  void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15546,9 +16137,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
15546
16137
  }
15547
16138
 
15548
16139
  void llama_backend_free(void) {
15549
- #ifdef GGML_USE_MPI
15550
- ggml_mpi_backend_free();
15551
- #endif
15552
16140
  ggml_quantize_free();
15553
16141
  }
15554
16142
 
@@ -15691,6 +16279,7 @@ struct llama_context * llama_new_context_with_model(
15691
16279
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
15692
16280
  }
15693
16281
 
16282
+ cparams.yarn_attn_factor *= hparams.rope_attn_factor;
15694
16283
  cparams.causal_attn = hparams.causal_attn;
15695
16284
 
15696
16285
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15949,20 +16538,6 @@ struct llama_context * llama_new_context_with_model(
15949
16538
  }
15950
16539
  }
15951
16540
 
15952
- #ifdef GGML_USE_MPI
15953
- ctx->ctx_mpi = ggml_mpi_init();
15954
-
15955
- if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
15956
- // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
15957
- // TODO: needs fix after #3228
15958
- GGML_ASSERT(false && "not implemented");
15959
- //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
15960
- //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
15961
- llama_backend_free();
15962
- exit(1);
15963
- }
15964
- #endif
15965
-
15966
16541
  return ctx;
15967
16542
  }
15968
16543
 
@@ -15999,7 +16574,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15999
16574
  // these models do not use RoPE
16000
16575
  case LLM_ARCH_GPT2:
16001
16576
  case LLM_ARCH_GPTJ:
16002
- case LLM_ARCH_GPTNEOX:
16003
16577
  case LLM_ARCH_MPT:
16004
16578
  case LLM_ARCH_REFACT:
16005
16579
  case LLM_ARCH_BLOOM:
@@ -16019,13 +16593,14 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16019
16593
  case LLM_ARCH_XVERSE:
16020
16594
  case LLM_ARCH_COMMAND_R:
16021
16595
  case LLM_ARCH_OLMO:
16596
+ case LLM_ARCH_ARCTIC:
16597
+ case LLM_ARCH_DEEPSEEK2:
16022
16598
  return LLAMA_ROPE_TYPE_NORM;
16023
16599
 
16024
16600
  // the pairs of head values are offset by n_rot/2
16025
16601
  case LLM_ARCH_FALCON:
16026
16602
  case LLM_ARCH_GROK:
16027
16603
  case LLM_ARCH_DBRX:
16028
- case LLM_ARCH_PERSIMMON:
16029
16604
  case LLM_ARCH_BERT:
16030
16605
  case LLM_ARCH_NOMIC_BERT:
16031
16606
  case LLM_ARCH_STABLELM:
@@ -16036,6 +16611,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16036
16611
  case LLM_ARCH_PHI3:
16037
16612
  case LLM_ARCH_GEMMA:
16038
16613
  case LLM_ARCH_STARCODER2:
16614
+ case LLM_ARCH_GPTNEOX:
16039
16615
  return LLAMA_ROPE_TYPE_NEOX;
16040
16616
 
16041
16617
  // all model arches should be listed explicitly here
@@ -16195,6 +16771,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
16195
16771
  }
16196
16772
 
16197
16773
  // make tensors
16774
+ cvec.tensors.reserve(model.hparams.n_layer);
16198
16775
  cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
16199
16776
  for (size_t il = 1; il < model.hparams.n_layer; il++) {
16200
16777
  struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16203,6 +16780,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
16203
16780
  }
16204
16781
 
16205
16782
  // allocate tensors / buffers and zero
16783
+ cvec.ctxs.reserve(ctx_map.size());
16784
+ cvec.bufs.reserve(ctx_map.size());
16206
16785
  for (auto it : ctx_map) {
16207
16786
  ggml_backend_buffer_type_t buft = it.first;
16208
16787
  ggml_context * ctx = it.second;
@@ -17411,6 +17990,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
17411
17990
  ctx->cparams.n_threads_batch = n_threads_batch;
17412
17991
  }
17413
17992
 
17993
+ uint32_t llama_n_threads(struct llama_context * ctx) {
17994
+ return ctx->cparams.n_threads;
17995
+ }
17996
+
17997
+ uint32_t llama_n_threads_batch(struct llama_context * ctx) {
17998
+ return ctx->cparams.n_threads_batch;
17999
+ }
18000
+
17414
18001
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
17415
18002
  ctx->abort_callback = abort_callback;
17416
18003
  ctx->abort_callback_data = abort_callback_data;
@@ -17634,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
17634
18221
  );
17635
18222
  }
17636
18223
 
18224
+ bool llama_token_is_control(const struct llama_model * model, llama_token token) {
18225
+ return llama_is_control_token(model->vocab, token);
18226
+ }
18227
+
17637
18228
  llama_token llama_token_bos(const struct llama_model * model) {
17638
18229
  return model->vocab.special_bos_id;
17639
18230
  }
@@ -17705,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {
17705
18296
 
17706
18297
  const auto cpts = unicode_cpts_from_utf8(text);
17707
18298
  for (const auto cpt : cpts) {
17708
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
18299
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
18300
+ try {
18301
+ decoded_text += unicode_utf8_to_byte(utf8);
18302
+ } catch (const std::out_of_range & e) {
18303
+ decoded_text += "[UNK_BYTE_0x";
18304
+ for (const auto c : utf8) {
18305
+ decoded_text += format("%02x", (uint8_t) c);
18306
+ }
18307
+ decoded_text += text + "]";
18308
+ }
17709
18309
  }
17710
18310
 
17711
18311
  return decoded_text;
@@ -17713,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {
17713
18313
 
17714
18314
  // does not write null-terminator to buf
17715
18315
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
18316
+ // if we have a cache - use it
18317
+ {
18318
+ const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
18319
+
18320
+ if (!cache.empty()) {
18321
+ const auto & res = cache.at(token);
18322
+ if (length < (int) res.size()) {
18323
+ return -(int) res.size();
18324
+ }
18325
+ memcpy(buf, res.c_str(), res.size());
18326
+ return res.size();
18327
+ }
18328
+ }
18329
+
17716
18330
  if (0 <= token && token < llama_n_vocab(model)) {
17717
18331
  switch (llama_vocab_get_type(model->vocab)) {
17718
- case LLAMA_VOCAB_TYPE_WPM:
17719
- case LLAMA_VOCAB_TYPE_SPM: {
17720
- // NOTE: we accept all unsupported token types,
17721
- // suppressing them like CONTROL tokens.
17722
- if (llama_is_normal_token(model->vocab, token)) {
17723
- std::string result = model->vocab.id_to_token[token].text;
17724
- llama_unescape_whitespace(result);
17725
- if (length < (int) result.length()) {
17726
- return -(int) result.length();
17727
- }
17728
- memcpy(buf, result.c_str(), result.length());
17729
- return result.length();
17730
- } else if (
17731
- (llama_is_user_defined_token(model->vocab, token)) ||
17732
- (llama_is_control_token (model->vocab, token) && special)) {
17733
- std::string result = model->vocab.id_to_token[token].text;
17734
- if (length < (int) result.length()) {
17735
- return -(int) result.length();
17736
- }
17737
- memcpy(buf, result.c_str(), result.length());
17738
- return result.length();
17739
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
17740
- if (length < 3) {
17741
- return -3;
17742
- }
17743
- memcpy(buf, "\xe2\x96\x85", 3);
17744
- return 3;
17745
- } else if (llama_is_byte_token(model->vocab, token)) {
17746
- if (length < 1) {
17747
- return -1;
18332
+ case LLAMA_VOCAB_TYPE_WPM:
18333
+ case LLAMA_VOCAB_TYPE_SPM: {
18334
+ // NOTE: we accept all unsupported token types,
18335
+ // suppressing them like CONTROL tokens.
18336
+ if (llama_is_normal_token(model->vocab, token)) {
18337
+ std::string result = model->vocab.id_to_token[token].text;
18338
+ llama_unescape_whitespace(result);
18339
+ if (length < (int) result.length()) {
18340
+ return -(int) result.length();
18341
+ }
18342
+ memcpy(buf, result.c_str(), result.length());
18343
+ return result.length();
18344
+ } else if (
18345
+ (llama_is_user_defined_token(model->vocab, token)) ||
18346
+ (llama_is_control_token (model->vocab, token) && special)) {
18347
+ std::string result = model->vocab.id_to_token[token].text;
18348
+ if (length < (int) result.length()) {
18349
+ return -(int) result.length();
18350
+ }
18351
+ memcpy(buf, result.c_str(), result.length());
18352
+ return result.length();
18353
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
18354
+ if (length < 3) {
18355
+ return -3;
18356
+ }
18357
+ memcpy(buf, "\xe2\x96\x85", 3);
18358
+ return 3;
18359
+ } else if (llama_is_byte_token(model->vocab, token)) {
18360
+ if (length < 1) {
18361
+ return -1;
18362
+ }
18363
+ buf[0] = llama_token_to_byte(model->vocab, token);
18364
+ return 1;
17748
18365
  }
17749
- buf[0] = llama_token_to_byte(model->vocab, token);
17750
- return 1;
18366
+ break;
17751
18367
  }
17752
- break;
17753
- }
17754
- case LLAMA_VOCAB_TYPE_BPE: {
17755
- // NOTE: we accept all unsupported token types,
17756
- // suppressing them like CONTROL tokens.
17757
- if (llama_is_normal_token(model->vocab, token)) {
17758
- std::string result = model->vocab.id_to_token[token].text;
17759
- result = llama_decode_text(result);
17760
- if (length < (int) result.length()) {
17761
- return -(int) result.length();
17762
- }
17763
- memcpy(buf, result.c_str(), result.length());
17764
- return result.length();
17765
- } else if (
17766
- (llama_is_user_defined_token(model->vocab, token)) ||
17767
- (llama_is_control_token (model->vocab, token) && special)) {
17768
- std::string result = model->vocab.id_to_token[token].text;
17769
- if (length < (int) result.length()) {
17770
- return -(int) result.length();
18368
+ case LLAMA_VOCAB_TYPE_BPE: {
18369
+ // NOTE: we accept all unsupported token types,
18370
+ // suppressing them like CONTROL tokens.
18371
+ if (llama_is_normal_token(model->vocab, token)) {
18372
+ std::string result = model->vocab.id_to_token[token].text;
18373
+ result = llama_decode_text(result);
18374
+ if (length < (int) result.length()) {
18375
+ return -(int) result.length();
18376
+ }
18377
+ memcpy(buf, result.c_str(), result.length());
18378
+ return result.length();
18379
+ } else if (
18380
+ (llama_is_user_defined_token(model->vocab, token)) ||
18381
+ (llama_is_control_token (model->vocab, token) && special)) {
18382
+ std::string result = model->vocab.id_to_token[token].text;
18383
+ if (length < (int) result.length()) {
18384
+ return -(int) result.length();
18385
+ }
18386
+ memcpy(buf, result.c_str(), result.length());
18387
+ return result.length();
17771
18388
  }
17772
- memcpy(buf, result.c_str(), result.length());
17773
- return result.length();
18389
+ break;
17774
18390
  }
17775
- break;
17776
- }
17777
- default:
17778
- GGML_ASSERT(false);
18391
+ default:
18392
+ GGML_ASSERT(false);
17779
18393
  }
17780
18394
  }
17781
18395
  return 0;
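The cache only changes the fast path; the calling convention stays the same: the return value is the number of bytes written, and a negative value means the buffer was too small, with the magnitude giving the required size. A small usage sketch of that contract (a hedged example, not code from this library; the helper name is hypothetical and error handling is kept minimal):

    #include <algorithm>
    #include <string>
    #include "llama.h"

    // Convert one token to its text piece, growing the buffer when the first call reports it is too small.
    static std::string token_to_piece(const struct llama_model * model, llama_token token, bool special) {
        std::string piece(8, '\0');                                                        // small initial guess
        int32_t n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), special);
        if (n < 0) {
            piece.resize(-n);                                                              // negative return = required size
            n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), special);
        }
        piece.resize(std::max<int32_t>(n, 0));
        return piece;
    }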
@@ -17845,6 +18459,15 @@ static int32_t llama_chat_apply_template_internal(
17845
18459
  }
17846
18460
  }
17847
18461
  // llama2 templates seem to not care about "add_generation_prompt"
18462
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
18463
+ // Phi 3
18464
+ for (auto message : chat) {
18465
+ std::string role(message->role);
18466
+ ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
18467
+ }
18468
+ if (add_ass) {
18469
+ ss << "<|assistant|>\n";
18470
+ }
17848
18471
  } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
17849
18472
  // zephyr template
17850
18473
  for (auto message : chat) {
@@ -17977,15 +18600,6 @@ static int32_t llama_chat_apply_template_internal(
17977
18600
  if (add_ass) {
17978
18601
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
17979
18602
  }
17980
- } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
17981
- // Phi 3
17982
- for (auto message : chat) {
17983
- std::string role(message->role);
17984
- ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
17985
- }
17986
- if (add_ass) {
17987
- ss << "<|assistant|>\n";
17988
- }
17989
18603
  } else {
17990
18604
  // template not supported
17991
18605
  return -1;
@@ -18107,8 +18721,10 @@ const char * llama_print_system_info(void) {
18107
18721
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
18108
18722
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
18109
18723
  s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
18724
+ s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
18110
18725
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
18111
18726
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
18727
+ s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
18112
18728
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
18113
18729
  s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
18114
18730
  s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
@@ -18167,6 +18783,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
18167
18783
  g_state.log_callback_user_data = user_data;
18168
18784
  #ifdef GGML_USE_METAL
18169
18785
  ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
18786
+ #elif defined(GGML_USE_CUDA)
18787
+ ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
18170
18788
  #endif
18171
18789
  }
18172
18790