llama_cpp 0.15.2 → 0.15.4

This diff shows the changes between the publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
@@ -26,16 +26,9 @@
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
- #ifdef GGML_USE_MPI
- # include "ggml-mpi.h"
- #endif
- #ifndef QK_K
- # ifdef GGML_QKK_64
- # define QK_K 64
- # else
- # define QK_K 256
- # endif
- #endif
+
+ // TODO: replace with ggml API call
+ #define QK_K 256

  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 60
+ #define LLAMA_MAX_EXPERTS 160

  //
  // logging
@@ -205,7 +198,6 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,8 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_UNKNOWN,
  };

@@ -242,7 +236,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MPT, "mpt" },
  { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -266,6 +259,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -286,11 +281,15 @@ enum llm_kv {
  LLM_KV_CONTEXT_LENGTH,
  LLM_KV_EMBEDDING_LENGTH,
  LLM_KV_BLOCK_COUNT,
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_EXPERT_SHARED_COUNT,
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
  LLM_KV_POOLING_TYPE,
  LLM_KV_LOGIT_SCALE,

@@ -303,14 +302,18 @@ enum llm_kv {
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
  LLM_KV_ATTENTION_CAUSAL,
+ LLM_KV_ATTENTION_Q_LORA_RANK,
+ LLM_KV_ATTENTION_KV_LORA_RANK,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_FREQ_BASE,
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
  LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

  LLM_KV_SPLIT_NO,
  LLM_KV_SPLIT_COUNT,
@@ -359,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -380,14 +387,18 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },

  { LLM_KV_SPLIT_NO, "split.no" },
  { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -441,6 +452,8 @@ enum llm_tensor {
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
  LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
  LLM_TENSOR_ATTN_Q,
  LLM_TENSOR_ATTN_K,
  LLM_TENSOR_ATTN_V,
@@ -460,6 +473,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_NORM_EXPS,
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
@@ -476,6 +490,12 @@ enum llm_tensor {
  LLM_TENSOR_SSM_A,
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
  };

  static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -598,23 +618,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
- {
- LLM_ARCH_PERSIMMON,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
- { LLM_TENSOR_OUTPUT, "output"},
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
- },
- },
  {
  LLM_ARCH_MPT,
  {
@@ -825,18 +828,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  {
  LLM_ARCH_PHI3,
  {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -1052,6 +1057,57 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_ARCTIC,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
+ {
+ LLM_ARCH_DEEPSEEK2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1646,12 +1702,13 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ // NOTE: avoid ever using this except for building the token_to_piece caches
+ static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
@@ -1697,6 +1754,8 @@ struct llama_state {
  llama_state() {
  #ifdef GGML_USE_METAL
  ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+ ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
  #endif
  }

@@ -1710,23 +1769,31 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_14M,
  MODEL_17M,
  MODEL_22M,
  MODEL_33M,
+ MODEL_70M,
  MODEL_109M,
  MODEL_137M,
+ MODEL_160M,
  MODEL_335M,
+ MODEL_410M,
  MODEL_0_5B,
  MODEL_1B,
+ MODEL_1_4B,
  MODEL_2B,
+ MODEL_2_8B,
  MODEL_3B,
  MODEL_4B,
+ MODEL_6_9B,
  MODEL_7B,
  MODEL_8B,
  MODEL_12B,
  MODEL_13B,
  MODEL_14B,
  MODEL_15B,
+ MODEL_16B,
  MODEL_20B,
  MODEL_30B,
  MODEL_34B,
@@ -1734,6 +1801,7 @@ enum e_model {
  MODEL_40B,
  MODEL_65B,
  MODEL_70B,
+ MODEL_236B,
  MODEL_314B,
  MODEL_SMALL,
  MODEL_MEDIUM,
@@ -1743,6 +1811,7 @@ enum e_model {
  MODEL_8x7B,
  MODEL_8x22B,
  MODEL_16x12B,
+ MODEL_10B_128x3_66B,
  };

  static const size_t kiB = 1024;
@@ -1752,6 +1821,7 @@ static const size_t GiB = 1024*MiB;
  struct llama_hparams {
  bool vocab_only;
  bool rope_finetuned;
+ bool use_par_res;

  uint32_t n_vocab;
  uint32_t n_ctx_train; // context size the model was trained on
@@ -1767,12 +1837,21 @@ struct llama_hparams {
  uint32_t n_expert_used = 0;
  uint32_t n_vocab_type = 0; // for BERT-style token types

+ uint32_t n_layer_dense_lead = 0;
+ uint32_t n_lora_q = 0;
+ uint32_t n_lora_kv = 0;
+ uint32_t n_ff_exp = 0;
+ uint32_t n_expert_shared = 0;
+ float expert_weights_scale = 0.0;
+
  float f_norm_eps;
  float f_norm_rms_eps;

+ float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
+ float rope_yarn_log_mul;

  // for State Space Models
  uint32_t ssm_d_conv = 0;
@@ -1806,6 +1885,12 @@ struct llama_hparams {
  if (this->n_expert != other.n_expert) return true;
  if (this->n_expert_used != other.n_expert_used) return true;

+ if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
+ if (this->n_lora_q != other.n_lora_q) return true;
+ if (this->n_lora_kv != other.n_lora_kv) return true;
+ if (this->n_ff_exp != other.n_ff_exp) return true;
+ if (this->n_expert_shared != other.n_expert_shared) return true;
+
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

@@ -1818,8 +1903,11 @@ struct llama_hparams {

  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+ if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+ if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+ if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;

  return false;
  }
@@ -1895,6 +1983,8 @@ struct llama_layer {
  struct ggml_tensor * attn_k_norm_b;
  struct ggml_tensor * attn_out_norm;
  struct ggml_tensor * attn_out_norm_b;
+ struct ggml_tensor * attn_q_a_norm;
+ struct ggml_tensor * attn_kv_a_norm;

  // attention
  struct ggml_tensor * wq;
@@ -1902,6 +1992,10 @@ struct llama_layer {
  struct ggml_tensor * wv;
  struct ggml_tensor * wo;
  struct ggml_tensor * wqkv;
+ struct ggml_tensor * wq_a;
+ struct ggml_tensor * wq_b;
+ struct ggml_tensor * wkv_a_mqa;
+ struct ggml_tensor * wkv_b;

  // attention bias
  struct ggml_tensor * bq;
@@ -1915,6 +2009,7 @@ struct llama_layer {
  struct ggml_tensor * ffn_norm_b;
  struct ggml_tensor * layer_out_norm;
  struct ggml_tensor * layer_out_norm_b;
+ struct ggml_tensor * ffn_norm_exps;

  // ff
  struct ggml_tensor * ffn_gate; // w1
@@ -1934,8 +2029,9 @@ struct llama_layer {
  struct ggml_tensor * ffn_up_shexp;

  // ff bias
- struct ggml_tensor * ffn_down_b; // b2
- struct ggml_tensor * ffn_up_b; // b3
+ struct ggml_tensor * ffn_gate_b = nullptr;
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
  struct ggml_tensor * ffn_act;

  // mamba proj
@@ -1952,6 +2048,10 @@ struct llama_layer {
  // mamba bias
  struct ggml_tensor * ssm_conv1d_b;
  struct ggml_tensor * ssm_dt_b;
+
+ // long rope factors
+ struct ggml_tensor * rope_long = nullptr;
+ struct ggml_tensor * rope_short = nullptr;
  };

  struct llama_kv_cell {
@@ -2063,7 +2163,9 @@ struct llama_vocab {
  std::unordered_map<token, id> token_to_id;
  std::vector<token_data> id_to_token;

- std::unordered_map<token, id> special_tokens_cache;
+ std::vector<id> cache_special_tokens;
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
+ std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);

  std::map<std::pair<std::string, std::string>, int> bpe_ranks;

@@ -2268,10 +2370,6 @@ struct llama_context {

  // control vectors
  struct llama_control_vector cvec;
-
- #ifdef GGML_USE_MPI
- ggml_mpi_context * ctx_mpi = NULL;
- #endif
  };

  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2589,6 @@ static bool llama_kv_cache_init(
  static bool llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_batch & batch) {
- const uint32_t n_ctx = cache.size;
  const uint32_t n_tokens = batch.n_tokens;

  if (cache.recurrent) {
@@ -2542,16 +2639,16 @@ static bool llama_kv_cache_find_slot(
  }
  // otherwise, one cell per token.

- if (n_tokens > n_ctx) {
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ if (n_tokens > cache.size) {
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
  return false;
  }

  uint32_t n_tested = 0;

  while (true) {
- if (cache.head + n_tokens > n_ctx) {
- n_tested += n_ctx - cache.head;
+ if (cache.head + n_tokens > cache.size) {
+ n_tested += cache.size - cache.head;
  cache.head = 0;
  continue;
  }
@@ -2570,7 +2667,7 @@
  break;
  }

- if (n_tested >= n_ctx) {
+ if (n_tested >= cache.size) {
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
  return false;
  }
@@ -3330,6 +3427,39 @@ struct llama_model_loader {
  return get_arr_n(llm_kv(kid), result, required);
  }

+ template<typename T>
+ bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+ const int kid = gguf_find_key(meta, key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+ if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+ throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+ }
+
+ // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+ result.resize(arr_info.length);
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+ return true;
+ }
+
+ template<typename T>
+ bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+ return get_arr(llm_kv(kid), result, required);
+ }
+
  template<typename T>
  bool get_key(const std::string & key, T & result, const bool required = true) {
  auto it = kv_overrides.find(key);
@@ -3404,11 +3534,15 @@ struct llama_model_loader {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
  ggml_set_name(tensor, ggml_get_name(cur));

- n_created++;
+ if (duplicated) {
+ size_data += ggml_nbytes(cur);
+ } else {
+ n_created++;
+ }

  return tensor;
  }
@@ -3443,14 +3577,17 @@ struct llama_model_loader {
  return cur;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+ static const int TENSOR_NOT_REQUIRED = 1;
+ static const int TENSOR_DUPLICATED = 2;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

  if (cur == NULL) {
  return NULL;
  }

- return create_tensor_for(ctx, cur);
+ return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
  }

  struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3750,37 +3887,50 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

  static const char * llama_model_type_name(e_model type) {
  switch (type) {
- case MODEL_22M: return "22M";
- case MODEL_33M: return "33M";
- case MODEL_109M: return "109M";
- case MODEL_137M: return "137M";
- case MODEL_0_5B: return "0.5B";
- case MODEL_1B: return "1B";
- case MODEL_2B: return "2B";
- case MODEL_3B: return "3B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_12B: return "12B";
- case MODEL_13B: return "13B";
- case MODEL_14B: return "14B";
- case MODEL_15B: return "15B";
- case MODEL_20B: return "20B";
- case MODEL_30B: return "30B";
- case MODEL_34B: return "34B";
- case MODEL_35B: return "35B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- case MODEL_314B: return "314B";
- case MODEL_SMALL: return "0.1B";
- case MODEL_MEDIUM: return "0.4B";
- case MODEL_LARGE: return "0.8B";
- case MODEL_XL: return "1.5B";
- case MODEL_A2_7B: return "A2.7B";
- case MODEL_8x7B: return "8x7B";
- case MODEL_8x22B: return "8x22B";
- case MODEL_16x12B: return "16x12B";
- default: return "?B";
+ case MODEL_14M: return "14M";
+ case MODEL_17M: return "17M";
+ case MODEL_22M: return "22M";
+ case MODEL_33M: return "33M";
+ case MODEL_70M: return "70M";
+ case MODEL_109M: return "109M";
+ case MODEL_137M: return "137M";
+ case MODEL_160M: return "160M";
+ case MODEL_335M: return "335M";
+ case MODEL_410M: return "410M";
+ case MODEL_0_5B: return "0.5B";
+ case MODEL_1B: return "1B";
+ case MODEL_1_4B: return "1.4B";
+ case MODEL_2B: return "2B";
+ case MODEL_2_8B: return "2.8B";
+ case MODEL_3B: return "3B";
+ case MODEL_4B: return "4B";
+ case MODEL_6_9B: return "6.9B";
+ case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
+ case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
+ case MODEL_15B: return "15B";
+ case MODEL_16B: return "16B";
+ case MODEL_20B: return "20B";
+ case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
+ case MODEL_35B: return "35B";
+ case MODEL_40B: return "40B";
+ case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
+ case MODEL_236B: return "236B";
+ case MODEL_314B: return "314B";
+ case MODEL_SMALL: return "0.1B";
+ case MODEL_MEDIUM: return "0.4B";
+ case MODEL_LARGE: return "0.8B";
+ case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
+ case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+ default: return "?B";
  }
  }

@@ -3873,6 +4023,8 @@ static void llm_load_hparams(
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
  // sanity check for n_rot (optional)
  {
  hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3910,7 +4062,9 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+ // granite uses a vocab with len 49152
+ case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+ case 36: model.type = e_model::MODEL_8B; break; // granite
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3972,14 +4126,6 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 36: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
  case LLM_ARCH_REFACT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4267,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_14B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4187,6 +4334,8 @@ static void llm_load_hparams(
  case 30: model.type = e_model::MODEL_3B; break;
  case 32: model.type = e_model::MODEL_7B; break;
  case 40: model.type = e_model::MODEL_15B; break;
+ case 52: model.type = e_model::MODEL_20B; break; // granite
+ case 88: model.type = e_model::MODEL_34B; break; // granite
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4261,6 +4410,85 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ switch (hparams.n_layer) {
+ case 6:
+ switch (hparams.n_ff) {
+ case 512: model.type = e_model::MODEL_14M; break;
+ case 2048: model.type = e_model::MODEL_70M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 12:
+ switch (hparams.n_ff) {
+ case 3072: model.type = e_model::MODEL_160M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 16:
+ switch (hparams.n_ff) {
+ case 8192: model.type = e_model::MODEL_1B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff) {
+ case 4096: model.type = e_model::MODEL_410M; break;
+ case 8192: model.type = e_model::MODEL_1_4B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 32:
+ switch (hparams.n_ff) {
+ case 10240: model.type = e_model::MODEL_2_8B; break;
+ case 16384: model.type = e_model::MODEL_6_9B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 36:
+ switch (hparams.n_ff) {
+ case 20480: model.type = e_model::MODEL_12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 44:
+ switch (hparams.n_ff) {
+ case 24576: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ if (hparams.n_expert == 128) {
+ switch (hparams.n_layer) {
+ case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ bool is_lite = (hparams.n_layer == 27);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ if (!is_lite) {
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ }
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+ switch (hparams.n_layer) {
+ case 27: model.type = e_model::MODEL_16B; break;
+ case 60: model.type = e_model::MODEL_236B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4367,15 +4595,14 @@ static void llm_load_vocab(
  vocab.special_cls_id = 101;
  vocab.special_mask_id = 103;
  vocab.add_space_prefix = false;
- } else {
- if (tokenizer_model == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
- } else {
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
- return;
+ } else if (tokenizer_model == "gpt2") {
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
  }
+
  // read bpe merges and populate bpe ranks
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
  if (merges_keyidx == -1) {
@@ -4409,6 +4636,8 @@ static void llm_load_vocab(
  vocab.special_pad_id = -1;
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;
+ } else {
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  }

  // for now, only BPE models have pre-tokenizers
@@ -4461,12 +4690,18 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "qwen2") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "stablelm2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
  } else if (
  tokenizer_pre == "olmo") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
  } else if (
  tokenizer_pre == "dbrx") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -4582,7 +4817,8 @@ static void llm_load_vocab(
  (t.first == "<|eot_id|>" ||
  t.first == "<|im_end|>" ||
  t.first == "<|end|>" ||
- t.first == "<end_of_turn>"
+ t.first == "<end_of_turn>" ||
+ t.first == "<|endoftext|>"
  )
  ) {
  vocab.special_eot_id = t.second;
@@ -4594,97 +4830,40 @@ static void llm_load_vocab(

  // build special tokens cache
  {
- // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
- // and will always be correctly labeled in 'added_tokens.json' etc.
- // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
- // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
- // are special tokens.
- // From testing, this appears to correlate 1:1 with special tokens.
- //
-
- // Counting special tokens and verifying in only one direction
- // is sufficient to detect difference in those two sets.
- //
- uint32_t special_tokens_count_by_type = 0;
- uint32_t special_tokens_count_from_verification = 0;
-
- bool special_tokens_definition_mismatch = false;
-
- for (const auto & t : vocab.token_to_id) {
- const auto & token = t.first;
- const auto & id = t.second;
-
- // Count all non-normal tokens in the vocab while iterating
+ for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
  if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
- special_tokens_count_by_type++;
+ vocab.cache_special_tokens.push_back(id);
  }
+ }

- // Skip single character tokens
- if (token.length() > 1) {
- bool is_tokenizable = false;
-
- // Split token string representation in two, in all possible ways
- // and check if both halves can be matched to a valid token
- for (unsigned i = 1; i < token.length();) {
- const auto left = token.substr(0, i);
- const auto right = token.substr(i);
-
- // check if we didnt partition in the middle of a utf sequence
- auto utf = utf8_len(left.at(left.length() - 1));
-
- if (utf == 1) {
- if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
- vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
- is_tokenizable = true;
- break;
- }
- i++;
- } else {
- // skip over the rest of multibyte utf sequence
- i += utf - 1;
- }
- }
+ std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+ [&] (const llama_vocab::id a, const llama_vocab::id b) {
+ return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
+ }
+ );

- if (!is_tokenizable) {
- // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
- // it's faster to re-filter them here, since there are way less candidates now
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+ }

- // Calculate a total "utf" length of a token string representation
- size_t utf8_str_len = 0;
- for (unsigned i = 0; i < token.length();) {
- utf8_str_len++;
- i += utf8_len(token.at(i));
- }
+ // build token to piece caches
+ {
+ size_t size_cache = 0;

- // And skip the ones which are one character
- if (utf8_str_len > 1) {
- // At this point what we have left are special tokens only
- vocab.special_tokens_cache[token] = id;
+ std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
+ std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);

- // Count manually found special tokens
- special_tokens_count_from_verification++;
+ for (uint32_t id = 0; id < n_vocab; ++id) {
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
+ cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);

- // If this manually found special token is not marked as such, flag a mismatch
- if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
- special_tokens_definition_mismatch = true;
- }
- }
- }
- }
+ size_cache += cache_token_to_piece[id].size();
+ size_cache += cache_token_to_piece_special[id].size();
  }

- if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
- LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
- __func__,
- special_tokens_count_from_verification, vocab.id_to_token.size(),
- special_tokens_count_by_type, vocab.id_to_token.size()
- );
- } else {
- LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
- __func__,
- special_tokens_count_from_verification, vocab.id_to_token.size()
- );
- }
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
+ std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
  }
  }

@@ -4765,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+
+ if (model.arch == LLM_ARCH_DEEPSEEK2) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+ }
  }

  // Returns false if cancelled by progress_callback
@@ -4908,6 +5097,7 @@ static bool llm_load_tensors(
  // create tensors for the weights
  {
  const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_head = n_embd / hparams.n_head;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4920,8 +5110,6 @@ static bool llm_load_tensors(
  throw std::runtime_error("model has expert layers but no expert layers are used");
  }

- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
  ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
  ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
  ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -4942,12 +5130,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  if (model.arch != LLM_ARCH_MINICPM){
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }
  }
@@ -4966,10 +5152,10 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

@@ -4977,10 +5163,15 @@ static bool llm_load_tensors(
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ // optional MLP bias
+ layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  } else {
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5022,12 +5213,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5050,7 +5239,7 @@ static bool llm_load_tensors(

  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5152,11 +5341,9 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5169,8 +5356,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5188,7 +5375,12 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ if (!model.output) {
+ // needs to be on GPU
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5408,6 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
- {
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
- }
-
- for (int i = 0; i < n_layer; ++i) {
- ggml_context * ctx_layer = ctx_for_layer(i);
- ggml_context * ctx_split = ctx_for_layer_split(i);
-
- auto & layer = model.layers[i];
-
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
- }
- } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  {
@@ -5325,14 +5476,14 @@ static bool llm_load_tensors(
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
@@ -5394,18 +5545,16 @@ static bool llm_load_tensors(
5394
5545
  case LLM_ARCH_MPT:
5395
5546
  {
5396
5547
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5397
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
5548
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
5398
5549
 
5399
5550
  // output
5400
5551
  {
5401
5552
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5402
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
5553
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5403
5554
 
5404
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5555
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5405
5556
  if (!model.output) {
5406
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
5407
- ml.n_created--; // artificial tensor
5408
- ml.size_data += ggml_nbytes(model.output);
5557
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
5409
5558
  }
5410
5559
  }
5411
5560
 
@@ -5416,31 +5565,31 @@ static bool llm_load_tensors(
5416
5565
  auto & layer = model.layers[i];
5417
5566
 
5418
5567
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5419
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
5568
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5420
5569
 
5421
5570
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5422
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5571
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5423
5572
 
5424
5573
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5425
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
5574
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5426
5575
 
5427
5576
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5428
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5577
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5429
5578
 
5430
5579
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5431
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
5580
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5432
5581
 
5433
5582
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5434
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
5583
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5435
5584
 
5436
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5437
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5585
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5586
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5438
5587
 
5439
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5440
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5588
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5589
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5441
5590
 
5442
5591
  // AWQ ScaleActivation layer
5443
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
5592
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5444
5593
  }
5445
5594
  } break;
5446
5595
  case LLM_ARCH_STABLELM:
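The pattern repeated through the loader hunks above is mechanical: the old boolean "required" argument of create_tensor becomes a named flag (llama_model_loader::TENSOR_NOT_REQUIRED), and the hand-rolled bookkeeping for the tied output head (ml.n_created--; ml.size_data += ggml_nbytes(...)) is folded into a TENSOR_DUPLICATED flag. Below is a minimal standalone sketch of that flag idea; the names (toy_loader, TOY_TENSOR_*) are hypothetical stand-ins, not the real loader class.

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Hypothetical, simplified stand-in for the flag handling shown in the diff.
    enum toy_tensor_flags : uint32_t {
        TOY_TENSOR_REQUIRED     = 0,      // missing tensor is an error
        TOY_TENSOR_NOT_REQUIRED = 1 << 0, // missing tensor -> nullptr instead of an error
        TOY_TENSOR_DUPLICATED   = 1 << 1, // aliases an already-loaded tensor; skip the usual accounting
    };

    struct toy_loader {
        int n_created = 0; // tensors accounted for so far

        const char * create_tensor(const std::string & name, bool present_in_file, uint32_t flags) {
            if (!present_in_file) {
                if (flags & TOY_TENSOR_NOT_REQUIRED) return nullptr;
                fprintf(stderr, "missing required tensor: %s\n", name.c_str());
                return nullptr; // the real loader throws here
            }
            if (!(flags & TOY_TENSOR_DUPLICATED)) {
                n_created++;    // duplicated tensors no longer need a manual n_created-- fixup
            }
            return "tensor";    // placeholder for a ggml_tensor *
        }
    };

    int main() {
        toy_loader ml;
        // mirrors the tied-embedding fallback in the hunks above
        const char * output = ml.create_tensor("output.weight", /*present=*/false, TOY_TENSOR_NOT_REQUIRED);
        if (output == nullptr) {
            output = ml.create_tensor("token_embd.weight", /*present=*/true, TOY_TENSOR_DUPLICATED);
        }
        printf("n_created = %d\n", ml.n_created); // 0: the duplicated tensor is not double-counted
        return 0;
    }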
@@ -5469,17 +5618,17 @@ static bool llm_load_tensors(
5469
5618
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5470
5619
 
5471
5620
  // optional bias tensors, present in Stable LM 2 1.6B
5472
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
5473
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
5474
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
5621
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5622
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5623
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5475
5624
 
5476
5625
  // optional q and k layernorms, present in StableLM 2 12B
5477
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
5478
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
5626
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
5627
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
5479
5628
 
5480
5629
  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
5481
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
5482
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5630
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5631
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5483
5632
 
5484
5633
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5485
5634
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5522,12 +5671,10 @@ static bool llm_load_tensors(
5522
5671
  // output
5523
5672
  {
5524
5673
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5525
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5674
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5526
5675
  // if output is NULL, init from the input tok embed
5527
5676
  if (model.output == NULL) {
5528
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5529
- ml.n_created--; // artificial tensor
5530
- ml.size_data += ggml_nbytes(model.output);
5677
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5531
5678
  }
5532
5679
  }
5533
5680
 
@@ -5625,8 +5772,8 @@ static bool llm_load_tensors(
5625
5772
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5626
5773
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5627
5774
 
5628
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
5629
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5775
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5776
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5630
5777
 
5631
5778
  if (layer.wqkv == nullptr) {
5632
5779
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5663,17 +5810,20 @@ static bool llm_load_tensors(
5663
5810
  ggml_context* ctx_layer = ctx_for_layer(i);
5664
5811
  ggml_context* ctx_split = ctx_for_layer_split(i);
5665
5812
 
5666
- auto& layer = model.layers[i];
5813
+ auto & layer = model.layers[i];
5667
5814
 
5668
5815
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
5669
5816
 
5670
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
5671
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5817
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
5818
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5672
5819
 
5673
5820
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
5674
5821
 
5675
5822
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
5676
5823
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
5824
+
5825
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5826
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5677
5827
  }
5678
5828
  } break;
5679
5829
  case LLM_ARCH_PLAMO:
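In the hunk above, the long/short RoPE frequency-factor tensors exist only once in the file but are attached to every layer: layer 0 loads them with TENSOR_NOT_REQUIRED only, and every later layer ORs in TENSOR_DUPLICATED so the same data is not counted again. A small sketch of that sharing pattern, with placeholder values standing in for the real factor tensors:

    #include <cstdio>
    #include <vector>

    // Sketch of the sharing behind rope_long / rope_short above: the factors exist
    // once, every layer keeps a pointer to the same data, and only the first
    // reference counts toward the loaded-byte total.
    struct toy_layer {
        const float * rope_long  = nullptr;
        const float * rope_short = nullptr;
    };

    int main() {
        const std::vector<float> long_factors  = {1.0f, 1.2f, 1.5f, 2.0f}; // placeholder values
        const std::vector<float> short_factors = {1.0f, 1.0f, 1.0f, 1.0f};

        const int n_layer = 4; // assumed layer count
        std::vector<toy_layer> layers(n_layer);

        size_t counted_bytes = 0;
        for (int i = 0; i < n_layer; ++i) {
            layers[i].rope_long  = long_factors.data();
            layers[i].rope_short = short_factors.data();
            if (i == 0) { // later layers are "duplicated": same data, not re-counted
                counted_bytes += long_factors.size()*sizeof(float) + short_factors.size()*sizeof(float);
            }
        }
        printf("layers sharing factors: %d, bytes counted once: %zu\n", n_layer, counted_bytes);
        return 0;
    }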
@@ -5842,9 +5992,7 @@ static bool llm_load_tensors(
5842
5992
 
5843
5993
  // output
5844
5994
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5845
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
5846
- ml.n_created--; // artificial tensor
5847
- ml.size_data += ggml_nbytes(model.output);
5995
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
5848
5996
 
5849
5997
  const int64_t n_ff = hparams.n_ff;
5850
5998
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5879,12 +6027,10 @@ static bool llm_load_tensors(
5879
6027
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5880
6028
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5881
6029
 
5882
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6030
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5883
6031
  // if output is NULL, init from the input tok embed
5884
6032
  if (model.output == NULL) {
5885
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5886
- ml.n_created--; // artificial tensor
5887
- ml.size_data += ggml_nbytes(model.output);
6033
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5888
6034
  }
5889
6035
 
5890
6036
  }
@@ -5935,12 +6081,10 @@ static bool llm_load_tensors(
5935
6081
  {
5936
6082
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5937
6083
 
5938
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6084
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5939
6085
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
5940
6086
  if (model.output == NULL) {
5941
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5942
- ml.n_created--; // artificial tensor
5943
- ml.size_data += ggml_nbytes(model.output);
6087
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5944
6088
  }
5945
6089
  }
5946
6090
 
@@ -6001,9 +6145,7 @@ static bool llm_load_tensors(
6001
6145
  {
6002
6146
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6003
6147
  // init output from the input tok embed
6004
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6005
- ml.n_created--; // artificial tensor
6006
- ml.size_data += ggml_nbytes(model.output);
6148
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6007
6149
  }
6008
6150
 
6009
6151
  for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6177,10 @@ static bool llm_load_tensors(
6035
6177
 
6036
6178
  // output
6037
6179
  {
6038
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6180
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6039
6181
  // if output is NULL, init from the input tok embed
6040
6182
  if (model.output == NULL) {
6041
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6042
- ml.n_created--; // artificial tensor
6043
- ml.size_data += ggml_nbytes(model.output);
6183
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6044
6184
  }
6045
6185
  }
6046
6186
 
@@ -6060,30 +6200,169 @@ static bool llm_load_tensors(
6060
6200
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6061
6201
  }
6062
6202
  } break;
6063
- default:
6064
- throw std::runtime_error("unknown architecture");
6065
- }
6066
- }
6203
+ case LLM_ARCH_GPTNEOX:
6204
+ {
6205
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6206
+ // output
6207
+ {
6208
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6209
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
6210
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6211
+ }
6067
6212
 
6068
- ml.done_getting_tensors();
6213
+ for (int i = 0; i < n_layer; ++i) {
6214
+ ggml_context * ctx_layer = ctx_for_layer(i);
6215
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6069
6216
 
6070
- ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
6071
- model.mappings.reserve(ml.mappings.size());
6217
+ auto & layer = model.layers[i];
6072
6218
 
6073
- // create the backend buffers
6074
- std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
6075
- ctx_bufs.reserve(ctx_map.size());
6219
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6220
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
6076
6221
 
6077
- // Ensure we have enough capacity for the maximum backend buffer we will potentially create
6078
- size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6079
- model.bufs.reserve(n_max_backend_buffer);
6222
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
6223
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
6080
6224
 
6081
- for (auto & it : ctx_map) {
6082
- ggml_backend_buffer_type_t buft = it.first;
6083
- ggml_context * ctx = it.second;
6225
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6226
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
6084
6227
 
6085
- llama_buf_map bufs;
6086
- bufs.reserve(n_max_backend_buffer);
6228
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6229
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
6230
+
6231
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
6232
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
6233
+
6234
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6235
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
6236
+ }
6237
+ } break;
6238
+ case LLM_ARCH_ARCTIC:
6239
+ {
6240
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6241
+
6242
+ // output
6243
+ {
6244
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6245
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6246
+ // if output is NULL, init from the input tok embed
6247
+ if (model.output == NULL) {
6248
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6249
+ }
6250
+ }
6251
+
6252
+ for (int i = 0; i < n_layer; ++i) {
6253
+ ggml_context * ctx_layer = ctx_for_layer(i);
6254
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6255
+
6256
+ auto & layer = model.layers[i];
6257
+
6258
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6259
+
6260
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
6261
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
6262
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
6263
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6264
+
6265
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6266
+
6267
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
6268
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
6269
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
6270
+
6271
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6272
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
6273
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
6274
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
6275
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
6276
+ }
6277
+ } break;
6278
+ case LLM_ARCH_DEEPSEEK2:
6279
+ {
6280
+ bool is_lite = (hparams.n_layer == 27);
6281
+
6282
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
6283
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
6284
+ const uint32_t q_lora_rank = hparams.n_lora_q;
6285
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
6286
+ const uint32_t n_ff_exp = hparams.n_ff_exp;
6287
+
6288
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6289
+
6290
+ // output
6291
+ {
6292
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6293
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6294
+ }
6295
+
6296
+ for (int i = 0; i < n_layer; ++i) {
6297
+ ggml_context * ctx_layer = ctx_for_layer(i);
6298
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6299
+
6300
+ auto & layer = model.layers[i];
6301
+
6302
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6303
+ if (!is_lite) {
6304
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
6305
+ }
6306
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
6307
+
6308
+ if (!is_lite) {
6309
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
6310
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
6311
+ } else {
6312
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
6313
+ }
6314
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
6315
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
6316
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
6317
+
6318
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6319
+
6320
+ if ((uint32_t) i < hparams.n_layer_dense_lead) {
6321
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
6322
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
6323
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6324
+ } else {
6325
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6326
+
6327
+ GGML_ASSERT(hparams.n_expert > 0);
6328
+ GGML_ASSERT(hparams.n_expert_used > 0);
6329
+
6330
+ // MoE branch
6331
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6332
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
6333
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6334
+
6335
+ // Shared expert branch
6336
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6337
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
6338
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6339
+ }
6340
+ }
6341
+ } break;
6342
+ default:
6343
+ throw std::runtime_error("unknown architecture");
6344
+ }
6345
+ }
6346
+
6347
+ ml.done_getting_tensors();
6348
+
6349
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
6350
+ model.mappings.reserve(ml.mappings.size());
6351
+
6352
+ // create the backend buffers
6353
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
6354
+ ctx_bufs.reserve(ctx_map.size());
6355
+
6356
+ // Ensure we have enough capacity for the maximum backend buffer we will potentially create
6357
+ size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
6358
+ model.bufs.reserve(n_max_backend_buffer);
6359
+
6360
+ for (auto & it : ctx_map) {
6361
+ ggml_backend_buffer_type_t buft = it.first;
6362
+ ggml_context * ctx = it.second;
6363
+
6364
+ llama_buf_map bufs;
6365
+ bufs.reserve(n_max_backend_buffer);
6087
6366
 
6088
6367
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
6089
6368
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
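The new LLM_ARCH_DEEPSEEK2 case above wires up DeepSeek-V2's multi-head latent attention: queries and keys/values are first projected into low-rank q_lora_rank / kv_lora_rank spaces (the "lite" variant, detected by n_layer == 27, skips the query compression), only an n_rot-wide rotary slice bypasses the compression, and after n_layer_dense_lead dense layers the FFN switches to routed experts plus a shared-expert branch. A quick shape check of those projection tensors, with hyperparameter values assumed purely for illustration (in the real loader they come from the GGUF metadata):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Assumed DeepSeek-V2-style hyperparameters (illustrative only).
        const uint32_t n_embd        = 5120;
        const uint32_t n_head        = 128;
        const uint32_t n_embd_head_k = 192;  // per-head Q/K width
        const uint32_t n_embd_head_v = 128;  // per-head V width
        const uint32_t n_rot         = 64;   // rotary slice
        const uint32_t q_lora_rank   = 1536;
        const uint32_t kv_lora_rank  = 512;

        const uint32_t n_embd_head_qk_rope = n_rot;                  // rotated part of each head
        const uint32_t n_embd_head_qk_nope = n_embd_head_k - n_rot;  // non-rotated part

        // Shapes mirror the create_tensor calls in the hunk above (ne[0] x ne[1]).
        printf("wq_a      : %u x %u\n", n_embd, q_lora_rank);
        printf("wq_b      : %u x %u\n", q_lora_rank, n_head * n_embd_head_k);
        printf("wkv_a_mqa : %u x %u\n", n_embd, kv_lora_rank + n_embd_head_qk_rope);
        printf("wkv_b     : %u x %u\n", kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v));
        printf("wo        : %u x %u\n", n_head * n_embd_head_v, n_embd);
        return 0;
    }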
@@ -6324,10 +6603,7 @@ static struct ggml_tensor * llm_build_inp_embd(
6324
6603
 
6325
6604
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
6326
6605
  } else {
6327
- #ifdef GGML_USE_MPI
6328
- GGML_ASSERT(false && "not implemented");
6329
- #endif
6330
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6606
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6331
6607
  inpL = lctx.inp_embd;
6332
6608
  ggml_set_input(lctx.inp_embd);
6333
6609
  }
@@ -6517,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
6517
6793
  int64_t n_expert_used,
6518
6794
  llm_ffn_op_type type_op,
6519
6795
  bool norm_w,
6796
+ bool scale_w,
6797
+ float w_scale,
6520
6798
  const llm_build_cb & cb,
6521
6799
  int il) {
6522
6800
  int64_t n_embd = cur->ne[0];
@@ -6548,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
6548
6826
 
6549
6827
  weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6550
6828
  }
6829
+ if (scale_w) {
6830
+ weights = ggml_scale(ctx, weights, w_scale);
6831
+ cb(weights, "ffn_moe_weights_scaled", il);
6832
+ }
6551
6833
 
6552
6834
  cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6553
6835
  ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
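The two parameters added to llm_build_moe_ffn (scale_w, w_scale) let an architecture rescale the selected expert weights by a constant after routing; existing callers pass "false, 0.0" to keep the old behavior. The real graph does this with ggml ops (softmax, top-k selection, ggml_scale); the sketch below is only the scalar analogue of that routing math, showing where optional renormalization and the constant scale slot in:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Toy router: softmax over expert logits, keep the top-k weights,
    // optionally renormalize them (norm_w) and/or scale them (scale_w, w_scale).
    static std::vector<std::pair<int,float>> route(const std::vector<float> & logits,
                                                   int n_used, bool norm_w, bool scale_w, float w_scale) {
        std::vector<float> probs(logits.size());
        const float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i] - mx); sum += probs[i]; }
        for (float & p : probs) p /= sum;

        std::vector<int> idx(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        std::vector<std::pair<int,float>> out;
        float used_sum = 0.0f;
        for (int i = 0; i < n_used; ++i) { out.push_back({idx[i], probs[idx[i]]}); used_sum += probs[idx[i]]; }
        for (auto & e : out) {
            if (norm_w)  e.second /= used_sum;  // renormalize over the selected experts
            if (scale_w) e.second *= w_scale;   // constant scale applied after selection
        }
        return out;
    }

    int main() {
        const auto w = route({0.1f, 2.0f, -1.0f, 1.5f}, /*n_used=*/2,
                             /*norm_w=*/false, /*scale_w=*/true, /*w_scale=*/2.5f);
        for (const auto & e : w) printf("expert %d -> weight %.3f\n", e.first, e.second);
        return 0;
    }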
@@ -6652,7 +6934,7 @@ static struct ggml_tensor * llm_build_kqv(
6652
6934
 
6653
6935
  cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6654
6936
 
6655
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6937
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6656
6938
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6657
6939
  }
6658
6940
 
@@ -6661,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
6661
6943
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6662
6944
  cb(kq, "kq", il);
6663
6945
 
6664
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6946
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6665
6947
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6666
6948
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6667
6949
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
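GPT-NeoX joins Phi-2/Phi-3 in the list of architectures whose K·Q product is forced to F32, both in the flash-attention path and via ggml_mul_mat_set_prec, because accumulating the unscaled attention logits in F16 can produce NaNs (per the comment and the linked PR discussion above). The toy loop below only illustrates the magnitude argument with assumed numbers: a single head-dimension dot product can already exceed the largest finite F16 value, so accumulation precision matters; the real threshold depends on activation scales and the backend.

    #include <cstdio>

    int main() {
        const int   n_embd_head = 128;    // assumed head size
        const float q = 32.0f, k = 32.0f; // assumed (large but plausible) activation magnitudes

        float acc = 0.0f;                 // F32 accumulator
        for (int i = 0; i < n_embd_head; ++i) {
            acc += q * k;                 // 128 * 1024 = 131072
        }

        const float f16_max = 65504.0f;   // largest finite half-precision value
        printf("dot product = %.0f, F16 max = %.0f -> %s\n",
               acc, f16_max, acc > f16_max ? "would overflow an F16 accumulator" : "fits in F16");
        return 0;
    }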
@@ -6886,17 +7168,20 @@ struct llm_build_context {
6886
7168
  cb(lctx.inp_K_shift, "K_shift", -1);
6887
7169
  ggml_set_input(lctx.inp_K_shift);
6888
7170
 
7171
+
6889
7172
  for (int il = 0; il < n_layer; ++il) {
7173
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
6890
7174
  struct ggml_tensor * tmp =
6891
7175
  // we rotate only the first n_rot dimensions
6892
- ggml_rope_custom_inplace(ctx0,
7176
+ ggml_rope_ext_inplace(ctx0,
6893
7177
  ggml_view_3d(ctx0, kv_self.k_l[il],
6894
7178
  n_embd_head_k, n_head_kv, n_ctx,
6895
7179
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
6896
7180
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6897
7181
  0),
6898
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7182
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6899
7183
  ext_factor, attn_factor, beta_fast, beta_slow);
7184
+
6900
7185
  cb(tmp, "K_shifted", il);
6901
7186
  ggml_build_forward_expand(gf, tmp);
6902
7187
  }
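The K-shift graph above now calls ggml_rope_ext_inplace and passes the per-layer frequency factors from build_rope_factors, so cache shifting stays consistent with models that use scaled RoPE. The property being exploited is that RoPE is a pure rotation: moving a cached key from position p to position p+delta only requires rotating it by the angle corresponding to delta. A scalar sketch of that composition for one dimension pair (plain RoPE, no frequency factors):

    #include <cmath>
    #include <cstdio>

    // rotate one (x, y) pair by the RoPE angle for position `pos` and pair frequency `theta`
    static void rope_pair(float & x, float & y, float pos, float theta) {
        const float c = std::cos(pos * theta), s = std::sin(pos * theta);
        const float x2 = x * c - y * s;
        const float y2 = x * s + y * c;
        x = x2; y = y2;
    }

    int main() {
        const float theta = 0.01f; // assumed per-pair frequency
        float ax = 1.0f, ay = 2.0f;
        float bx = 1.0f, by = 2.0f;

        rope_pair(ax, ay, /*pos=*/100.0f, theta);  // key originally stored at position 100
        rope_pair(ax, ay, /*pos=*/-30.0f, theta);  // shift it by -30 positions in place
        rope_pair(bx, by, /*pos=*/ 70.0f, theta);  // same key rotated directly at position 70

        printf("shifted: (%.6f, %.6f)  direct: (%.6f, %.6f)\n", ax, ay, bx, by);
        return 0;                                  // both pairs match: rotations compose additively
    }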
@@ -6999,6 +7284,17 @@ struct llm_build_context {
6999
7284
  return lctx.inp_pos;
7000
7285
  }
7001
7286
 
7287
+ struct ggml_tensor * build_rope_factors(int il) {
7288
+ // choose long/short freq factors based on the context size
7289
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7290
+
7291
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7292
+ return model.layers[il].rope_long;
7293
+ }
7294
+
7295
+ return model.layers[il].rope_short;
7296
+ }
7297
+
7002
7298
  struct ggml_tensor * build_inp_out_ids() {
7003
7299
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
7004
7300
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
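The build_rope_factors helper added above picks between the long-context and short-context factor tensors by comparing the per-sequence context size against the model's original (pre-scaling) training context. A standalone sketch of that selection, with hypothetical struct and field names standing in for the real hparams/cparams:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct toy_hparams { uint32_t n_yarn_orig_ctx; };
    struct toy_cparams { uint32_t n_ctx; uint32_t n_seq_max; };

    // returns the factors to feed into the rope op (the real helper does this per layer)
    static const std::vector<float> & pick_rope_factors(const toy_hparams & hp, const toy_cparams & cp,
                                                        const std::vector<float> & rope_long,
                                                        const std::vector<float> & rope_short) {
        const uint32_t n_ctx_per_seq = cp.n_ctx / cp.n_seq_max;
        return n_ctx_per_seq > hp.n_yarn_orig_ctx ? rope_long : rope_short;
    }

    int main() {
        const toy_hparams hp = {4096};                      // assumed original training context
        const std::vector<float> rope_long  = {1.0f, 1.5f}; // placeholder factors
        const std::vector<float> rope_short = {1.0f, 1.0f};

        for (const uint32_t n_ctx : {4096u, 131072u}) {
            const toy_cparams cp = {n_ctx, 1};
            const auto & f = pick_rope_factors(hp, cp, rope_long, rope_short);
            printf("n_ctx=%6u -> %s factors\n", n_ctx, (&f == &rope_long) ? "long" : "short");
        }
        return 0;
    }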
@@ -7106,15 +7402,15 @@ struct llm_build_context {
7106
7402
  cb(Vcur, "Vcur", il);
7107
7403
  }
7108
7404
 
7109
- Qcur = ggml_rope_custom(
7110
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7405
+ Qcur = ggml_rope_ext(
7406
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7111
7407
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7112
7408
  ext_factor, attn_factor, beta_fast, beta_slow
7113
7409
  );
7114
7410
  cb(Qcur, "Qcur", il);
7115
7411
 
7116
- Kcur = ggml_rope_custom(
7117
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7412
+ Kcur = ggml_rope_ext(
7413
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7118
7414
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7119
7415
  ext_factor, attn_factor, beta_fast, beta_slow
7120
7416
  );
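From here on, every ggml_rope_custom call site becomes ggml_rope_ext with one extra tensor argument right after the positions: the optional per-dimension frequency factors, passed as nullptr when a model does not use them (as in this hunk). My understanding, hedged, is that a factor rescales the rotation frequency of the corresponding dimension pair, with factors above 1 stretching the effective wavelength for long-context models; the scalar sketch below encodes that interpretation rather than the exact ggml kernel or its memory layout.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Scalar sketch of RoPE with optional per-pair frequency factors (assumption:
    // a factor divides the base frequency of its pair).
    static void toy_rope(std::vector<float> & x, int pos, float freq_base,
                         const std::vector<float> * freq_factors) {
        const int n_pairs = (int) x.size() / 2;
        for (int i = 0; i < n_pairs; ++i) {
            float theta = pos * std::pow(freq_base, -2.0f * i / (float) x.size());
            if (freq_factors) {
                theta /= (*freq_factors)[i];
            }
            const float c = std::cos(theta), s = std::sin(theta);
            const float x0 = x[2*i], x1 = x[2*i + 1];
            x[2*i]     = x0 * c - x1 * s;
            x[2*i + 1] = x0 * s + x1 * c;
        }
    }

    int main() {
        std::vector<float> plain  = {1, 0, 1, 0};
        std::vector<float> scaled = {1, 0, 1, 0};
        const std::vector<float> factors = {4.0f, 4.0f}; // placeholder long-context factors

        toy_rope(plain,  /*pos=*/1000, /*freq_base=*/10000.0f, nullptr);   // ggml_rope_ext(..., nullptr, ...)
        toy_rope(scaled, /*pos=*/1000, /*freq_base=*/10000.0f, &factors);  // ggml_rope_ext(..., rope_factors, ...)

        printf("plain : %.3f %.3f %.3f %.3f\n", plain[0], plain[1], plain[2], plain[3]);
        printf("scaled: %.3f %.3f %.3f %.3f\n", scaled[0], scaled[1], scaled[2], scaled[3]);
        return 0;
    }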
@@ -7144,9 +7440,9 @@ struct llm_build_context {
7144
7440
  cb(cur, "ffn_norm", il);
7145
7441
 
7146
7442
  cur = llm_build_ffn(ctx0, cur,
7147
- model.layers[il].ffn_up, NULL,
7148
- model.layers[il].ffn_gate, NULL,
7149
- model.layers[il].ffn_down, NULL,
7443
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7444
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
7445
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7150
7446
  NULL,
7151
7447
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
7152
7448
  cb(cur, "ffn_out", il);
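This hunk switches the dense FFN call to pass the per-projection bias tensors instead of NULL, so models that ship ffn_up_b / ffn_gate_b / ffn_down_b actually apply them. For reference, a scalar sketch of a gated (SiLU-style) FFN with optional biases, roughly the computation llm_build_ffn assembles from ggml ops in this parallel-gate configuration:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    // y = W_down * (silu(W_gate*x + b_gate) * (W_up*x + b_up)) + b_down, biases optional
    static std::vector<float> gated_ffn(const std::vector<std::vector<float>> & w_up,   const std::vector<float> * b_up,
                                        const std::vector<std::vector<float>> & w_gate, const std::vector<float> * b_gate,
                                        const std::vector<std::vector<float>> & w_down, const std::vector<float> * b_down,
                                        const std::vector<float> & x) {
        auto matvec = [](const std::vector<std::vector<float>> & w, const std::vector<float> & v) {
            std::vector<float> out(w.size(), 0.0f);
            for (size_t r = 0; r < w.size(); ++r)
                for (size_t c = 0; c < v.size(); ++c) out[r] += w[r][c] * v[c];
            return out;
        };
        std::vector<float> up = matvec(w_up, x), gate = matvec(w_gate, x);
        for (size_t i = 0; i < up.size(); ++i) {
            if (b_up)   up[i]   += (*b_up)[i];
            if (b_gate) gate[i] += (*b_gate)[i];
            up[i] = silu(gate[i]) * up[i];
        }
        std::vector<float> y = matvec(w_down, up);
        if (b_down) for (size_t i = 0; i < y.size(); ++i) y[i] += (*b_down)[i];
        return y;
    }

    int main() {
        const std::vector<std::vector<float>> w_up   = {{1, 0}, {0, 1}};
        const std::vector<std::vector<float>> w_gate = {{1, 1}, {1, -1}};
        const std::vector<std::vector<float>> w_down = {{0.5f, 0.5f}};
        const std::vector<float> b_up = {0.1f, 0.1f};
        const auto y = gated_ffn(w_up, &b_up, w_gate, nullptr, w_down, nullptr, {1.0f, 2.0f});
        printf("y = %.4f\n", y[0]);
        return 0;
    }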
@@ -7164,6 +7460,7 @@ struct llm_build_context {
7164
7460
  model.layers[il].ffn_down_exps,
7165
7461
  n_expert, n_expert_used,
7166
7462
  LLM_FFN_SILU, true,
7463
+ false, 0.0,
7167
7464
  cb, il);
7168
7465
  cb(cur, "ffn_moe_out", il);
7169
7466
  }
@@ -7236,13 +7533,13 @@ struct llm_build_context {
7236
7533
 
7237
7534
  switch (model.type) {
7238
7535
  case MODEL_7B:
7239
- Qcur = ggml_rope_custom(
7240
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7536
+ Qcur = ggml_rope_ext(
7537
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7241
7538
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7242
7539
  ext_factor, attn_factor, beta_fast, beta_slow
7243
7540
  );
7244
- Kcur = ggml_rope_custom(
7245
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7541
+ Kcur = ggml_rope_ext(
7542
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7246
7543
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7544
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7545
  );
@@ -7348,15 +7645,15 @@ struct llm_build_context {
7348
7645
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7349
7646
  cb(Vcur, "Vcur", il);
7350
7647
 
7351
- Qcur = ggml_rope_custom(
7352
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7648
+ Qcur = ggml_rope_ext(
7649
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7353
7650
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7354
7651
  ext_factor, attn_factor, beta_fast, beta_slow
7355
7652
  );
7356
7653
  cb(Qcur, "Qcur", il);
7357
7654
 
7358
- Kcur = ggml_rope_custom(
7359
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7655
+ Kcur = ggml_rope_ext(
7656
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7360
7657
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7361
7658
  ext_factor, attn_factor, beta_fast, beta_slow
7362
7659
  );
@@ -7469,14 +7766,14 @@ struct llm_build_context {
7469
7766
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7470
7767
 
7471
7768
  // using mode = 2 for neox mode
7472
- Qcur = ggml_rope_custom(
7473
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7769
+ Qcur = ggml_rope_ext(
7770
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7474
7771
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7475
7772
  );
7476
7773
  cb(Qcur, "Qcur", il);
7477
7774
 
7478
- Kcur = ggml_rope_custom(
7479
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7775
+ Kcur = ggml_rope_ext(
7776
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7480
7777
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7481
7778
  );
7482
7779
  cb(Kcur, "Kcur", il);
@@ -7592,15 +7889,15 @@ struct llm_build_context {
7592
7889
  cb(Vcur, "Vcur", il);
7593
7890
  }
7594
7891
 
7595
- Qcur = ggml_rope_custom(
7596
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7892
+ Qcur = ggml_rope_ext(
7893
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7597
7894
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7598
7895
  ext_factor, attn_factor, beta_fast, beta_slow
7599
7896
  );
7600
7897
  cb(Qcur, "Qcur", il);
7601
7898
 
7602
- Kcur = ggml_rope_custom(
7603
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7899
+ Kcur = ggml_rope_ext(
7900
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7604
7901
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7605
7902
  ext_factor, attn_factor, beta_fast, beta_slow
7606
7903
  );
@@ -7645,6 +7942,7 @@ struct llm_build_context {
7645
7942
  model.layers[il].ffn_down_exps,
7646
7943
  n_expert, n_expert_used,
7647
7944
  LLM_FFN_GELU, true,
7945
+ false, 0.0,
7648
7946
  cb, il);
7649
7947
  cb(cur, "ffn_moe_out", il);
7650
7948
 
@@ -7744,15 +8042,15 @@ struct llm_build_context {
7744
8042
  cb(Kcur, "Kcur", il);
7745
8043
  cb(Vcur, "Vcur", il);
7746
8044
 
7747
- Qcur = ggml_rope_custom(
7748
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8045
+ Qcur = ggml_rope_ext(
8046
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7749
8047
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7750
8048
  ext_factor, attn_factor, beta_fast, beta_slow
7751
8049
  );
7752
8050
  cb(Qcur, "Qcur", il);
7753
8051
 
7754
- Kcur = ggml_rope_custom(
7755
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8052
+ Kcur = ggml_rope_ext(
8053
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7756
8054
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7757
8055
  ext_factor, attn_factor, beta_fast, beta_slow
7758
8056
  );
@@ -7788,6 +8086,7 @@ struct llm_build_context {
7788
8086
  model.layers[il].ffn_down_exps,
7789
8087
  n_expert, n_expert_used,
7790
8088
  LLM_FFN_SILU, true,
8089
+ false, 0.0,
7791
8090
  cb, il);
7792
8091
  cb(cur, "ffn_moe_out", il);
7793
8092
 
@@ -7921,213 +8220,6 @@ struct llm_build_context {
7921
8220
  return gf;
7922
8221
  }
7923
8222
 
7924
- struct ggml_cgraph * build_persimmon() {
7925
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7926
-
7927
- const int64_t n_embd_head = hparams.n_embd_head_v;
7928
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7929
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
7930
-
7931
- struct ggml_tensor * cur;
7932
- struct ggml_tensor * inpL;
7933
-
7934
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7935
-
7936
- // inp_pos - contains the positions
7937
- struct ggml_tensor * inp_pos = build_inp_pos();
7938
-
7939
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7940
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7941
-
7942
- for (int il = 0; il < n_layer; ++il) {
7943
- struct ggml_tensor * residual = inpL;
7944
-
7945
- cur = llm_build_norm(ctx0, inpL, hparams,
7946
- model.layers[il].attn_norm,
7947
- model.layers[il].attn_norm_b,
7948
- LLM_NORM, cb, il);
7949
- cb(cur, "attn_norm", il);
7950
-
7951
- // self attention
7952
- {
7953
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7954
- cb(cur, "wqkv", il);
7955
-
7956
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7957
- cb(cur, "bqkv", il);
7958
-
7959
- // split qkv
7960
- GGML_ASSERT(n_head_kv == n_head);
7961
-
7962
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7963
- cb(tmpqkv, "tmpqkv", il);
7964
-
7965
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7966
- cb(tmpqkv_perm, "tmpqkv", il);
7967
-
7968
- struct ggml_tensor * tmpq = ggml_view_3d(
7969
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7970
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7971
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7972
- 0
7973
- );
7974
- cb(tmpq, "tmpq", il);
7975
-
7976
- struct ggml_tensor * tmpk = ggml_view_3d(
7977
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7978
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7979
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7980
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
7981
- );
7982
- cb(tmpk, "tmpk", il);
7983
-
7984
- // Q/K Layernorm
7985
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
7986
- model.layers[il].attn_q_norm,
7987
- model.layers[il].attn_q_norm_b,
7988
- LLM_NORM, cb, il);
7989
- cb(tmpq, "tmpq", il);
7990
-
7991
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
7992
- model.layers[il].attn_k_norm,
7993
- model.layers[il].attn_k_norm_b,
7994
- LLM_NORM, cb, il);
7995
- cb(tmpk, "tmpk", il);
7996
-
7997
- // RoPE the first n_rot of q/k, pass the other half, and concat.
7998
- struct ggml_tensor * qrot = ggml_view_3d(
7999
- ctx0, tmpq, n_rot, n_head, n_tokens,
8000
- ggml_element_size(tmpq) * n_embd_head,
8001
- ggml_element_size(tmpq) * n_embd_head * n_head,
8002
- 0
8003
- );
8004
- cb(qrot, "qrot", il);
8005
-
8006
- struct ggml_tensor * krot = ggml_view_3d(
8007
- ctx0, tmpk, n_rot, n_head, n_tokens,
8008
- ggml_element_size(tmpk) * n_embd_head,
8009
- ggml_element_size(tmpk) * n_embd_head * n_head,
8010
- 0
8011
- );
8012
- cb(krot, "krot", il);
8013
-
8014
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
8015
- struct ggml_tensor * qpass = ggml_view_3d(
8016
- ctx0, tmpq, n_rot, n_head, n_tokens,
8017
- ggml_element_size(tmpq) * n_embd_head,
8018
- ggml_element_size(tmpq) * n_embd_head * n_head,
8019
- ggml_element_size(tmpq) * n_rot
8020
- );
8021
- cb(qpass, "qpass", il);
8022
-
8023
- struct ggml_tensor * kpass = ggml_view_3d(
8024
- ctx0, tmpk, n_rot, n_head, n_tokens,
8025
- ggml_element_size(tmpk) * n_embd_head,
8026
- ggml_element_size(tmpk) * n_embd_head * n_head,
8027
- ggml_element_size(tmpk) * n_rot
8028
- );
8029
- cb(kpass, "kpass", il);
8030
-
8031
- struct ggml_tensor * qrotated = ggml_rope_custom(
8032
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8033
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8034
- );
8035
- cb(qrotated, "qrotated", il);
8036
-
8037
- struct ggml_tensor * krotated = ggml_rope_custom(
8038
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8039
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8040
- );
8041
- cb(krotated, "krotated", il);
8042
-
8043
- // ggml currently only supports concatenation on dim=2
8044
- // so we need to permute qrot, qpass, concat, then permute back.
8045
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
8046
- cb(qrotated, "qrotated", il);
8047
-
8048
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
8049
- cb(krotated, "krotated", il);
8050
-
8051
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
8052
- cb(qpass, "qpass", il);
8053
-
8054
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
8055
- cb(kpass, "kpass", il);
8056
-
8057
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
8058
- cb(Qcur, "Qcur", il);
8059
-
8060
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
8061
- cb(Kcur, "Kcur", il);
8062
-
8063
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
8064
- cb(Q, "Q", il);
8065
-
8066
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
8067
- cb(Kcur, "Kcur", il);
8068
-
8069
- struct ggml_tensor * Vcur = ggml_view_3d(
8070
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
8071
- ggml_element_size(tmpqkv_perm) * n_embd_head,
8072
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
8073
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
8074
- );
8075
- cb(Vcur, "Vcur", il);
8076
-
8077
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8078
- model.layers[il].wo, model.layers[il].bo,
8079
- Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8080
- }
8081
-
8082
- if (il == n_layer - 1) {
8083
- // skip computing output for unused tokens
8084
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8085
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8086
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8087
- }
8088
-
8089
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
8090
- cb(ffn_inp, "ffn_inp", il);
8091
-
8092
- // feed-forward network
8093
- {
8094
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
8095
- model.layers[il].ffn_norm,
8096
- model.layers[il].ffn_norm_b,
8097
- LLM_NORM, cb, il);
8098
- cb(cur, "ffn_norm", il);
8099
-
8100
- cur = llm_build_ffn(ctx0, cur,
8101
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8102
- NULL, NULL,
8103
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8104
- NULL,
8105
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
8106
- cb(cur, "ffn_out", il);
8107
- }
8108
-
8109
- cur = ggml_add(ctx0, cur, ffn_inp);
8110
- cb(cur, "l_out", il);
8111
-
8112
- inpL = cur;
8113
- }
8114
-
8115
- cur = inpL;
8116
-
8117
- cur = llm_build_norm(ctx0, cur, hparams,
8118
- model.output_norm,
8119
- model.output_norm_b,
8120
- LLM_NORM, cb, -1);
8121
- cb(cur, "result_norm", -1);
8122
-
8123
- cur = ggml_mul_mat(ctx0, model.output, cur);
8124
- cb(cur, "result_output", -1);
8125
-
8126
- ggml_build_forward_expand(gf, cur);
8127
-
8128
- return gf;
8129
- }
8130
-
8131
8223
  struct ggml_cgraph * build_refact() {
8132
8224
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8133
8225
 
@@ -8304,15 +8396,15 @@ struct llm_build_context {
8304
8396
  cb(Kcur, "Kcur", il);
8305
8397
  cb(Vcur, "Vcur", il);
8306
8398
 
8307
- Qcur = ggml_rope_custom(
8308
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8399
+ Qcur = ggml_rope_ext(
8400
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8309
8401
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8310
8402
  ext_factor, attn_factor, beta_fast, beta_slow
8311
8403
  );
8312
8404
  cb(Qcur, "Qcur", il);
8313
8405
 
8314
- Kcur = ggml_rope_custom(
8315
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8406
+ Kcur = ggml_rope_ext(
8407
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8316
8408
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8317
8409
  ext_factor, attn_factor, beta_fast, beta_slow
8318
8410
  );
@@ -8744,15 +8836,15 @@ struct llm_build_context {
8744
8836
  }
8745
8837
 
8746
8838
 
8747
- Qcur = ggml_rope_custom(
8748
- ctx0, Qcur, inp_pos,
8839
+ Qcur = ggml_rope_ext(
8840
+ ctx0, Qcur, inp_pos, nullptr,
8749
8841
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8750
8842
  ext_factor, attn_factor, beta_fast, beta_slow
8751
8843
  );
8752
8844
  cb(Qcur, "Qcur", il);
8753
8845
 
8754
- Kcur = ggml_rope_custom(
8755
- ctx0, Kcur, inp_pos,
8846
+ Kcur = ggml_rope_ext(
8847
+ ctx0, Kcur, inp_pos, nullptr,
8756
8848
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8757
8849
  ext_factor, attn_factor, beta_fast, beta_slow
8758
8850
  );
@@ -8864,14 +8956,14 @@ struct llm_build_context {
8864
8956
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8865
8957
 
8866
8958
  // using mode = 2 for neox mode
8867
- Qcur = ggml_rope_custom(
8868
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8959
+ Qcur = ggml_rope_ext(
8960
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8869
8961
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8870
8962
  );
8871
8963
  cb(Qcur, "Qcur", il);
8872
8964
 
8873
- Kcur = ggml_rope_custom(
8874
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8965
+ Kcur = ggml_rope_ext(
8966
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8875
8967
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8876
8968
  );
8877
8969
  cb(Kcur, "Kcur", il);
@@ -8975,15 +9067,15 @@ struct llm_build_context {
8975
9067
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8976
9068
  cb(Vcur, "Vcur", il);
8977
9069
 
8978
- Qcur = ggml_rope_custom(
8979
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9070
+ Qcur = ggml_rope_ext(
9071
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8980
9072
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8981
9073
  ext_factor, attn_factor, beta_fast, beta_slow
8982
9074
  );
8983
9075
  cb(Qcur, "Qcur", il);
8984
9076
 
8985
- Kcur = ggml_rope_custom(
8986
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9077
+ Kcur = ggml_rope_ext(
9078
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8987
9079
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8988
9080
  ext_factor, attn_factor, beta_fast, beta_slow
8989
9081
  );
@@ -9089,15 +9181,15 @@ struct llm_build_context {
9089
9181
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9090
9182
  cb(Vcur, "Vcur", il);
9091
9183
 
9092
- Qcur = ggml_rope_custom(
9093
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9184
+ Qcur = ggml_rope_ext(
9185
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9094
9186
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9095
9187
  ext_factor, attn_factor, beta_fast, beta_slow
9096
9188
  );
9097
9189
  cb(Qcur, "Qcur", il);
9098
9190
 
9099
- Kcur = ggml_rope_custom(
9100
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9191
+ Kcur = ggml_rope_ext(
9192
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9101
9193
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9102
9194
  ext_factor, attn_factor, beta_fast, beta_slow
9103
9195
  );
@@ -9133,6 +9225,7 @@ struct llm_build_context {
9133
9225
  model.layers[il].ffn_down_exps,
9134
9226
  n_expert, n_expert_used,
9135
9227
  LLM_FFN_SILU, false,
9228
+ false, 0.0,
9136
9229
  cb, il);
9137
9230
  cb(cur, "ffn_moe_out", il);
9138
9231
 
@@ -9241,8 +9334,8 @@ struct llm_build_context {
9241
9334
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9242
9335
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9243
9336
 
9244
- Qcur = ggml_rope_custom(
9245
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9337
+ Qcur = ggml_rope_ext(
9338
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9246
9339
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9247
9340
  );
9248
9341
  cb(Qcur, "Qcur", il);
@@ -9252,8 +9345,8 @@ struct llm_build_context {
9252
9345
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9253
9346
  cb(Qcur, "Qcur", il);
9254
9347
 
9255
- Kcur = ggml_rope_custom(
9256
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9348
+ Kcur = ggml_rope_ext(
9349
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9257
9350
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9258
9351
  );
9259
9352
  cb(Kcur, "Kcur", il);
@@ -9329,6 +9422,9 @@ struct llm_build_context {
9329
9422
 
9330
9423
  // self-attention
9331
9424
  {
9425
+ // rope freq factors for 128k context
9426
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
9427
+
9332
9428
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9333
9429
  model.layers[il].attn_norm,
9334
9430
  NULL,
@@ -9360,8 +9456,8 @@ struct llm_build_context {
9360
9456
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9361
9457
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9362
9458
 
9363
- Qcur = ggml_rope_custom(
9364
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9459
+ Qcur = ggml_rope_ext(
9460
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9365
9461
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9366
9462
  );
9367
9463
  cb(Qcur, "Qcur", il);
@@ -9369,8 +9465,8 @@ struct llm_build_context {
9369
9465
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9370
9466
  cb(Qcur, "Qcur", il);
9371
9467
 
9372
- Kcur = ggml_rope_custom(
9373
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9468
+ Kcur = ggml_rope_ext(
9469
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9374
9470
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9375
9471
  );
9376
9472
  cb(Kcur, "Kcur", il);
@@ -9476,14 +9572,14 @@ struct llm_build_context {
9476
9572
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9477
9573
  cb(Vcur, "Vcur", il);
9478
9574
 
9479
- Qcur = ggml_rope_custom(
9480
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
9575
+ Qcur = ggml_rope_ext(
9576
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9481
9577
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9482
9578
  ext_factor, attn_factor, beta_fast, beta_slow);
9483
9579
  cb(Qcur, "Qcur", il);
9484
9580
 
9485
- Kcur = ggml_rope_custom(
9486
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
9581
+ Kcur = ggml_rope_ext(
9582
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9487
9583
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9488
9584
  ext_factor, attn_factor, beta_fast, beta_slow);
9489
9585
  cb(Kcur, "Kcur", il);
@@ -9684,15 +9780,15 @@ struct llm_build_context {
9684
9780
  cb(tmpk, "tmpk", il);
9685
9781
  cb(Vcur, "Vcur", il);
9686
9782
 
9687
- struct ggml_tensor * Qcur = ggml_rope_custom(
9688
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
9783
+ struct ggml_tensor * Qcur = ggml_rope_ext(
9784
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9689
9785
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9690
9786
  ext_factor, attn_factor, beta_fast, beta_slow
9691
9787
  );
9692
9788
  cb(Qcur, "Qcur", il);
9693
9789
 
9694
- struct ggml_tensor * Kcur = ggml_rope_custom(
9695
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
9790
+ struct ggml_tensor * Kcur = ggml_rope_ext(
9791
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9696
9792
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9697
9793
  ext_factor, attn_factor, beta_fast, beta_slow
9698
9794
  );
@@ -9800,15 +9896,15 @@ struct llm_build_context {
9800
9896
  // cb(Vcur, "Vcur", il);
9801
9897
  // }
9802
9898
 
9803
- Qcur = ggml_rope_custom(
9804
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9899
+ Qcur = ggml_rope_ext(
9900
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9805
9901
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9806
9902
  ext_factor, attn_factor, beta_fast, beta_slow
9807
9903
  );
9808
9904
  cb(Qcur, "Qcur", il);
9809
9905
 
9810
- Kcur = ggml_rope_custom(
9811
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9906
+ Kcur = ggml_rope_ext(
9907
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9812
9908
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9813
9909
  ext_factor, attn_factor, beta_fast, beta_slow
9814
9910
  );
@@ -9917,15 +10013,15 @@ struct llm_build_context {
9917
10013
  cb(Vcur, "Vcur", il);
9918
10014
  }
9919
10015
 
9920
- Qcur = ggml_rope_custom(
9921
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10016
+ Qcur = ggml_rope_ext(
10017
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9922
10018
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9923
10019
  ext_factor, attn_factor, beta_fast, beta_slow
9924
10020
  );
9925
10021
  cb(Qcur, "Qcur", il);
9926
10022
 
9927
- Kcur = ggml_rope_custom(
9928
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10023
+ Kcur = ggml_rope_ext(
10024
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9929
10025
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9930
10026
  ext_factor, attn_factor, beta_fast, beta_slow
9931
10027
  );
@@ -10047,15 +10143,15 @@ struct llm_build_context {
10047
10143
  cb(Vcur, "Vcur", il);
10048
10144
  }
10049
10145
 
10050
- Qcur = ggml_rope_custom(
10051
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10146
+ Qcur = ggml_rope_ext(
10147
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10052
10148
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10053
10149
  ext_factor, attn_factor, beta_fast, beta_slow
10054
10150
  );
10055
10151
  cb(Qcur, "Qcur", il);
10056
10152
 
10057
- Kcur = ggml_rope_custom(
10058
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10153
+ Kcur = ggml_rope_ext(
10154
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10059
10155
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10060
10156
  ext_factor, attn_factor, beta_fast, beta_slow
10061
10157
  );
@@ -10167,8 +10263,8 @@ struct llm_build_context {
10167
10263
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10168
10264
  cb(Vcur, "Vcur", il);
10169
10265
 
10170
- Qcur = ggml_rope_custom(
10171
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
10266
+ Qcur = ggml_rope_ext(
10267
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10172
10268
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10173
10269
  ext_factor, attn_factor, beta_fast, beta_slow);
10174
10270
  cb(Qcur, "Qcur", il);
@@ -10176,8 +10272,8 @@ struct llm_build_context {
10176
10272
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
10177
10273
  cb(Qcur, "Qcur_scaled", il);
10178
10274
 
10179
- Kcur = ggml_rope_custom(
10180
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
10275
+ Kcur = ggml_rope_ext(
10276
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10181
10277
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10182
10278
  ext_factor, attn_factor, beta_fast, beta_slow);
10183
10279
  cb(Kcur, "Kcur", il);
@@ -10287,15 +10383,15 @@ struct llm_build_context {
10287
10383
  cb(Vcur, "Vcur", il);
10288
10384
  }
10289
10385
 
10290
- Qcur = ggml_rope_custom(
10291
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10386
+ Qcur = ggml_rope_ext(
10387
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10292
10388
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10293
10389
  ext_factor, attn_factor, beta_fast, beta_slow
10294
10390
  );
10295
10391
  cb(Qcur, "Qcur", il);
10296
10392
 
10297
- Kcur = ggml_rope_custom(
10298
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10393
+ Kcur = ggml_rope_ext(
10394
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10299
10395
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10300
10396
  ext_factor, attn_factor, beta_fast, beta_slow
10301
10397
  );
@@ -10577,15 +10673,15 @@ struct llm_build_context {
10577
10673
  cb(Kcur, "Kcur", il);
10578
10674
  }
10579
10675
 
10580
- Qcur = ggml_rope_custom(
10581
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10676
+ Qcur = ggml_rope_ext(
10677
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10582
10678
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10583
10679
  ext_factor, attn_factor, beta_fast, beta_slow
10584
10680
  );
10585
10681
  cb(Qcur, "Qcur", il);
10586
10682
 
10587
- Kcur = ggml_rope_custom(
10588
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10683
+ Kcur = ggml_rope_ext(
10684
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10589
10685
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10590
10686
  ext_factor, attn_factor, beta_fast, beta_slow
10591
10687
  );
@@ -10680,8 +10776,269 @@ struct llm_build_context {
10680
10776
 
10681
10777
  // norm
10682
10778
  cur = llm_build_norm(ctx0, inpL, hparams,
10683
- NULL, NULL,
10684
- LLM_NORM, cb, il);
10779
+ NULL, NULL,
10780
+ LLM_NORM, cb, il);
10781
+ cb(cur, "attn_norm", il);
10782
+
10783
+ // self-attention
10784
+ {
10785
+ // compute Q and K and RoPE them
10786
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10787
+ cb(Qcur, "Qcur", il);
10788
+ if (hparams.f_clamp_kqv > 0.0f) {
10789
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10790
+ cb(Qcur, "Qcur", il);
10791
+ }
10792
+
10793
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10794
+ cb(Kcur, "Kcur", il);
10795
+ if (hparams.f_clamp_kqv > 0.0f) {
10796
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10797
+ cb(Kcur, "Kcur", il);
10798
+ }
10799
+
10800
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10801
+ cb(Vcur, "Vcur", il);
10802
+ if (hparams.f_clamp_kqv > 0.0f) {
10803
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10804
+ cb(Vcur, "Vcur", il);
10805
+ }
10806
+
10807
+ Qcur = ggml_rope_ext(
10808
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10809
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10810
+ ext_factor, attn_factor, beta_fast, beta_slow
10811
+ );
10812
+ cb(Qcur, "Qcur", il);
10813
+
10814
+ Kcur = ggml_rope_ext(
10815
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10816
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10817
+ ext_factor, attn_factor, beta_fast, beta_slow
10818
+ );
10819
+ cb(Kcur, "Kcur", il);
10820
+
10821
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10822
+ model.layers[il].wo, nullptr,
10823
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10824
+ }
10825
+
10826
+ if (il == n_layer - 1) {
10827
+ // skip computing output for unused tokens
10828
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10829
+ n_tokens = n_outputs;
10830
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10831
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10832
+ }
10833
+
10834
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10835
+ cb(ffn_inp, "ffn_inp", il);
10836
+
10837
+ // feed-forward network
10838
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10839
+ NULL, NULL,
10840
+ LLM_NORM, cb, il);
10841
+ cb(cur, "ffn_norm", il);
10842
+
10843
+ cur = llm_build_ffn(ctx0, cur,
10844
+ model.layers[il].ffn_up, NULL,
10845
+ model.layers[il].ffn_gate, NULL,
10846
+ model.layers[il].ffn_down, NULL,
10847
+ NULL,
10848
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10849
+ cb(cur, "ffn_out", il);
10850
+
10851
+ cur = ggml_add(ctx0, cur, ffn_inp);
10852
+ cb(cur, "ffn_out", il);
10853
+
10854
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10855
+ if (layer_dir != nullptr) {
10856
+ cur = ggml_add(ctx0, cur, layer_dir);
10857
+ }
10858
+ cb(cur, "l_out", il);
10859
+
10860
+ // input for next layer
10861
+ inpL = cur;
10862
+ }
10863
+
10864
+ cur = inpL;
10865
+
10866
+ cur = llm_build_norm(ctx0, cur, hparams,
10867
+ NULL, NULL,
10868
+ LLM_NORM, cb, -1);
10869
+ cb(cur, "result_norm", -1);
10870
+
10871
+ // lm_head
10872
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10873
+ cb(cur, "result_output", -1);
10874
+
10875
+ ggml_build_forward_expand(gf, cur);
10876
+
10877
+ return gf;
10878
+ }
10879
+
10880
+ struct ggml_cgraph * build_gptneox() {
10881
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10882
+
10883
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10884
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
10885
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10886
+
10887
+ struct ggml_tensor * cur;
10888
+ struct ggml_tensor * inpL;
10889
+
10890
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10891
+
10892
+ // inp_pos - contains the positions
10893
+ struct ggml_tensor * inp_pos = build_inp_pos();
10894
+
10895
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10896
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10897
+
10898
+ for (int il = 0; il < n_layer; ++il) {
10899
+ cur = llm_build_norm(ctx0, inpL, hparams,
10900
+ model.layers[il].attn_norm,
10901
+ model.layers[il].attn_norm_b,
10902
+ LLM_NORM, cb, il);
10903
+ cb(cur, "attn_norm", il);
10904
+
10905
+ // self-attention
10906
+ {
10907
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10908
+ cb(cur, "wqkv", il);
10909
+
10910
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10911
+ cb(cur, "bqkv", il);
10912
+
10913
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10914
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
10915
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10916
+
10917
+ cb(Qcur, "Qcur", il);
10918
+ cb(Kcur, "Kcur", il);
10919
+ cb(Vcur, "Vcur", il);
10920
+
10921
+ Qcur = ggml_rope_ext(
10922
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10923
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10924
+ ext_factor, attn_factor, beta_fast, beta_slow
10925
+ );
10926
+ cb(Qcur, "Qcur", il);
10927
+
10928
+ Kcur = ggml_rope_ext(
10929
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10930
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10931
+ ext_factor, attn_factor, beta_fast, beta_slow
10932
+ );
10933
+ cb(Kcur, "Kcur", il);
10934
+
10935
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10936
+ model.layers[il].wo, model.layers[il].bo,
10937
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10938
+ }
10939
+
10940
+ if (il == n_layer - 1) {
10941
+ // skip computing output for unused tokens
10942
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10943
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10944
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10945
+ }
10946
+
10947
+ // ffn
10948
+ if (hparams.use_par_res) {
10949
+ // attention and ffn are computed in parallel
10950
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
10951
+
10952
+ struct ggml_tensor * attn_out = cur;
10953
+
10954
+ cur = llm_build_norm(ctx0, inpL, hparams,
10955
+ model.layers[il].ffn_norm,
10956
+ model.layers[il].ffn_norm_b,
10957
+ LLM_NORM, cb, il);
10958
+ cb(cur, "ffn_norm", il);
10959
+
10960
+ cur = llm_build_ffn(ctx0, cur,
10961
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10962
+ NULL, NULL,
10963
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10964
+ NULL,
10965
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10966
+ cb(cur, "ffn_out", il);
10967
+
10968
+ cur = ggml_add(ctx0, cur, inpL);
10969
+ cb(cur, "ffn_out", il);
10970
+
10971
+ inpL = ggml_add(ctx0, cur, attn_out);
10972
+ cb(inpL, "l_out", il);
10973
+ } else {
10974
+ // attention and ffn are computed sequentially
10975
+ // x = x + attn(ln1(x))
10976
+ // x = x + ffn(ln2(x))
10977
+
10978
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
10979
+ cb(ffn_inp, "ffn_inp", il);
10980
+
10981
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10982
+ model.layers[il].ffn_norm,
10983
+ model.layers[il].ffn_norm_b,
10984
+ LLM_NORM, cb, il);
10985
+ cb(cur, "ffn_norm", il);
10986
+
10987
+ cur = llm_build_ffn(ctx0, cur,
10988
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10989
+ NULL, NULL,
10990
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10991
+ NULL,
10992
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10993
+ cb(cur, "ffn_out", il);
10994
+
10995
+ inpL = ggml_add(ctx0, cur, ffn_inp);
10996
+ cb(inpL, "l_out", il);
10997
+ }
10998
+ }
10999
+
11000
+ cur = llm_build_norm(ctx0, inpL, hparams,
11001
+ model.output_norm,
11002
+ model.output_norm_b,
11003
+ LLM_NORM, cb, -1);
11004
+ cb(cur, "result_norm", -1);
11005
+
11006
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11007
+ cb(cur, "result_output", -1);
11008
+
11009
+ ggml_build_forward_expand(gf, cur);
11010
+
11011
+ return gf;
11012
+ }
11013
+
11014
+ struct ggml_cgraph * build_arctic() {
11015
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11016
+
11017
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11018
+ int32_t n_tokens = this->n_tokens;
11019
+
11020
+ const int64_t n_embd_head = hparams.n_embd_head_v;
11021
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
11022
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
11023
+
11024
+ struct ggml_tensor * cur;
11025
+ struct ggml_tensor * inpL;
11026
+
11027
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
11028
+
11029
+ // inp_pos - contains the positions
11030
+ struct ggml_tensor * inp_pos = build_inp_pos();
11031
+
11032
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11033
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
11034
+
11035
+ for (int il = 0; il < n_layer; ++il) {
11036
+ struct ggml_tensor * inpSA = inpL;
11037
+
11038
+ // norm
11039
+ cur = llm_build_norm(ctx0, inpL, hparams,
11040
+ model.layers[il].attn_norm, NULL,
11041
+ LLM_NORM_RMS, cb, il);
10685
11042
  cb(cur, "attn_norm", il);
10686
11043
 
10687
11044
  // self-attention
@@ -10689,41 +11046,29 @@ struct llm_build_context {
10689
11046
  // compute Q and K and RoPE them
10690
11047
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10691
11048
  cb(Qcur, "Qcur", il);
10692
- if (hparams.f_clamp_kqv > 0.0f) {
10693
- Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10694
- cb(Qcur, "Qcur", il);
10695
- }
10696
11049
 
10697
11050
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10698
11051
  cb(Kcur, "Kcur", il);
10699
- if (hparams.f_clamp_kqv > 0.0f) {
10700
- Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10701
- cb(Kcur, "Kcur", il);
10702
- }
10703
11052
 
10704
11053
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10705
11054
  cb(Vcur, "Vcur", il);
10706
- if (hparams.f_clamp_kqv > 0.0f) {
10707
- Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10708
- cb(Vcur, "Vcur", il);
10709
- }
10710
11055
 
10711
- Qcur = ggml_rope_custom(
10712
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
11056
+ Qcur = ggml_rope_ext(
11057
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10713
11058
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10714
11059
  ext_factor, attn_factor, beta_fast, beta_slow
10715
11060
  );
10716
11061
  cb(Qcur, "Qcur", il);
10717
11062
 
10718
- Kcur = ggml_rope_custom(
10719
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
11063
+ Kcur = ggml_rope_ext(
11064
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10720
11065
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10721
11066
  ext_factor, attn_factor, beta_fast, beta_slow
10722
11067
  );
10723
11068
  cb(Kcur, "Kcur", il);
10724
11069
 
10725
11070
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10726
- model.layers[il].wo, nullptr,
11071
+ model.layers[il].wo, NULL,
10727
11072
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10728
11073
  }
10729
11074
 
@@ -10740,8 +11085,8 @@ struct llm_build_context {
10740
11085
 
10741
11086
  // feed-forward network
10742
11087
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
10743
- NULL, NULL,
10744
- LLM_NORM, cb, il);
11088
+ model.layers[il].ffn_norm, NULL,
11089
+ LLM_NORM_RMS, cb, il);
10745
11090
  cb(cur, "ffn_norm", il);
10746
11091
 
10747
11092
  cur = llm_build_ffn(ctx0, cur,
@@ -10752,7 +11097,27 @@ struct llm_build_context {
10752
11097
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10753
11098
  cb(cur, "ffn_out", il);
10754
11099
 
10755
- cur = ggml_add(ctx0, cur, ffn_inp);
11100
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
11101
+ cb(ffn_out, "ffn_out", il);
11102
+
11103
+ // MoE
11104
+ cur = llm_build_norm(ctx0, inpSA, hparams,
11105
+ model.layers[il].ffn_norm_exps, NULL,
11106
+ LLM_NORM_RMS, cb, il);
11107
+ cb(cur, "ffn_norm_exps", il);
11108
+
11109
+ cur = llm_build_moe_ffn(ctx0, cur,
11110
+ model.layers[il].ffn_gate_inp,
11111
+ model.layers[il].ffn_up_exps,
11112
+ model.layers[il].ffn_gate_exps,
11113
+ model.layers[il].ffn_down_exps,
11114
+ n_expert, n_expert_used,
11115
+ LLM_FFN_SILU, true,
11116
+ false, 0.0,
11117
+ cb, il);
11118
+ cb(cur, "ffn_moe_out", il);
11119
+
11120
+ cur = ggml_add(ctx0, cur, ffn_out);
10756
11121
  cb(cur, "ffn_out", il);
10757
11122
 
10758
11123
  ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
@@ -10768,8 +11133,240 @@ struct llm_build_context {
10768
11133
  cur = inpL;
10769
11134
 
10770
11135
  cur = llm_build_norm(ctx0, cur, hparams,
10771
- NULL, NULL,
10772
- LLM_NORM, cb, -1);
11136
+ model.output_norm, NULL,
11137
+ LLM_NORM_RMS, cb, -1);
11138
+ cb(cur, "result_norm", -1);
11139
+
11140
+ // lm_head
11141
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11142
+ cb(cur, "result_output", -1);
11143
+
11144
+ ggml_build_forward_expand(gf, cur);
11145
+
11146
+ return gf;
11147
+ }
11148
+
11149
+ struct ggml_cgraph * build_deepseek2() {
11150
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11151
+
11152
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11153
+ int32_t n_tokens = this->n_tokens;
11154
+
11155
+ bool is_lite = (hparams.n_layer == 27);
11156
+
11157
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
11158
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
11159
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
11160
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
11161
+ const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
11162
+
11163
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
11164
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
11165
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
11166
+
11167
+ struct ggml_tensor * cur;
11168
+ struct ggml_tensor * inpL;
11169
+
11170
+ // {n_embd, n_tokens}
11171
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
11172
+
11173
+ // inp_pos - contains the positions
11174
+ struct ggml_tensor * inp_pos = build_inp_pos();
11175
+
11176
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11177
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
11178
+
11179
+ for (int il = 0; il < n_layer; ++il) {
11180
+ struct ggml_tensor * inpSA = inpL;
11181
+
11182
+ // norm
11183
+ cur = llm_build_norm(ctx0, inpL, hparams,
11184
+ model.layers[il].attn_norm, NULL,
11185
+ LLM_NORM_RMS, cb, il);
11186
+ cb(cur, "attn_norm", il);
11187
+
11188
+ // self_attention
11189
+ {
11190
+ struct ggml_tensor * q = NULL;
11191
+ if (!is_lite) {
11192
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
11193
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
11194
+ cb(q, "q", il);
11195
+
11196
+ q = llm_build_norm(ctx0, q, hparams,
11197
+ model.layers[il].attn_q_a_norm, NULL,
11198
+ LLM_NORM_RMS, cb, il);
11199
+ cb(q, "q", il);
11200
+
11201
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
11202
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
11203
+ cb(q, "q", il);
11204
+ } else {
11205
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11206
+ cb(q, "q", il);
11207
+ }
11208
+
11209
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11210
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
11211
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11212
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11213
+ 0);
11214
+ cb(q_nope, "q_nope", il);
11215
+
11216
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
11217
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
11218
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11219
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11220
+ ggml_row_size(q->type, n_embd_head_qk_nope));
11221
+ cb(q_pe, "q_pe", il);
11222
+
11223
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
11224
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11225
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
11226
+
11227
+ // split into {kv_lora_rank, n_tokens}
11228
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
11229
+ kv_pe_compresseed->nb[1],
11230
+ 0);
11231
+ cb(kv_compressed, "kv_compressed", il);
11232
+
11233
+ // and {n_embd_head_qk_rope, n_tokens}
11234
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
11235
+ kv_pe_compresseed->nb[1],
11236
+ kv_pe_compresseed->nb[1],
11237
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
11238
+ cb(k_pe, "k_pe", il);
11239
+
11240
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
11241
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
11242
+ model.layers[il].attn_kv_a_norm, NULL,
11243
+ LLM_NORM_RMS, cb, il);
11244
+ cb(kv_compressed, "kv_compressed", il);
11245
+
11246
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
11247
+ struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
11248
+ cb(kv, "kv", il);
11249
+
11250
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11251
+ struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
11252
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
11253
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11254
+ 0);
11255
+ cb(k_nope, "k_nope", il);
11256
+
11257
+ // and {n_head * n_embd_head_v, n_tokens}
11258
+ struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
11259
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11260
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
11261
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
11262
+ cb(v_states, "v_states", il);
11263
+
11264
+ v_states = ggml_cont(ctx0, v_states);
11265
+ cb(v_states, "v_states", il);
11266
+
11267
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
11268
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
11269
+ 0);
11270
+ cb(v_states, "v_states", il);
11271
+
11272
+ q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11273
+ q_pe = ggml_rope_ext(
11274
+ ctx0, q_pe, inp_pos, nullptr,
11275
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11276
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11277
+ );
11278
+ cb(q_pe, "q_pe", il);
11279
+
11280
+ // shared RoPE key
11281
+ k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11282
+ k_pe = ggml_rope_ext(
11283
+ ctx0, k_pe, inp_pos, nullptr,
11284
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11285
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11286
+ );
11287
+ cb(k_pe, "k_pe", il);
11288
+
11289
+ struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
11290
+ cb(q_states, "q_states", il);
11291
+
11292
+ struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
11293
+ cb(k_states, "k_states", il);
11294
+
11295
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11296
+ model.layers[il].wo, NULL,
11297
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
11298
+ }
11299
+
11300
+ if (il == n_layer - 1) {
11301
+ // skip computing output for unused tokens
11302
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11303
+ n_tokens = n_outputs;
11304
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11305
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11306
+ }
11307
+
11308
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
11309
+ cb(ffn_inp, "ffn_inp", il);
11310
+
11311
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
11312
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11313
+ model.layers[il].ffn_norm, NULL,
11314
+ LLM_NORM_RMS, cb, il);
11315
+ cb(cur, "ffn_norm", il);
11316
+
11317
+ cur = llm_build_ffn(ctx0, cur,
11318
+ model.layers[il].ffn_up, NULL,
11319
+ model.layers[il].ffn_gate, NULL,
11320
+ model.layers[il].ffn_down, NULL,
11321
+ NULL,
11322
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11323
+ cb(cur, "ffn_out", il);
11324
+ } else {
11325
+ // MoE branch
11326
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11327
+ model.layers[il].ffn_norm, NULL,
11328
+ LLM_NORM_RMS, cb, il);
11329
+ cb(cur, "ffn_norm", il);
11330
+
11331
+ ggml_tensor * moe_out =
11332
+ llm_build_moe_ffn(ctx0, cur,
11333
+ model.layers[il].ffn_gate_inp,
11334
+ model.layers[il].ffn_up_exps,
11335
+ model.layers[il].ffn_gate_exps,
11336
+ model.layers[il].ffn_down_exps,
11337
+ n_expert, n_expert_used,
11338
+ LLM_FFN_SILU, false,
11339
+ true, hparams.expert_weights_scale,
11340
+ cb, il);
11341
+ cb(moe_out, "ffn_moe_out", il);
11342
+
11343
+ // FFN shared expert
11344
+ {
11345
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
11346
+ model.layers[il].ffn_up_shexp, NULL,
11347
+ model.layers[il].ffn_gate_shexp, NULL,
11348
+ model.layers[il].ffn_down_shexp, NULL,
11349
+ NULL,
11350
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11351
+ cb(ffn_shexp, "ffn_shexp", il);
11352
+
11353
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
11354
+ cb(cur, "ffn_out", il);
11355
+ }
11356
+ }
11357
+
11358
+ cur = ggml_add(ctx0, cur, ffn_inp);
11359
+ cb(cur, "l_out", il);
11360
+
11361
+ // input for next layer
11362
+ inpL = cur;
11363
+ }
11364
+
11365
+ cur = inpL;
11366
+
11367
+ cur = llm_build_norm(ctx0, cur, hparams,
11368
+ model.output_norm, NULL,
11369
+ LLM_NORM_RMS, cb, -1);
10773
11370
  cb(cur, "result_norm", -1);
10774
11371
 
10775
11372
  // lm_head
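The YaRN pre-scaling comment in build_deepseek2 above (and the linked discussion #7416) boils down to the two constants computed before the layer loop; written out, with d_k = n_embd_head_k:

    mscale             = attn_factor * (1 + rope_yarn_log_mul * ln(1 / freq_scale))
    kq_scale           = mscale^2 / sqrt(d_k)
    attn_factor_scaled = 1 / (1 + 0.1 * ln(1 / freq_scale))

so the attention logits are scaled by mscale^2 / sqrt(d_k) instead of the usual 1 / sqrt(d_k), and the two RoPE calls use attn_factor_scaled in place of attn_factor.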
@@ -10780,6 +11377,7 @@ struct llm_build_context {
10780
11377
 
10781
11378
  return gf;
10782
11379
  }
11380
+
10783
11381
  };
10784
11382
 
10785
11383
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -10896,10 +11494,6 @@ static struct ggml_cgraph * llama_build_graph(
10896
11494
  {
10897
11495
  result = llm.build_starcoder();
10898
11496
  } break;
10899
- case LLM_ARCH_PERSIMMON:
10900
- {
10901
- result = llm.build_persimmon();
10902
- } break;
10903
11497
  case LLM_ARCH_REFACT:
10904
11498
  {
10905
11499
  result = llm.build_refact();
@@ -10994,6 +11588,18 @@ static struct ggml_cgraph * llama_build_graph(
10994
11588
  {
10995
11589
  result = llm.build_olmo();
10996
11590
  } break;
11591
+ case LLM_ARCH_GPTNEOX:
11592
+ {
11593
+ result = llm.build_gptneox();
11594
+ } break;
11595
+ case LLM_ARCH_ARCTIC:
11596
+ {
11597
+ result = llm.build_arctic();
11598
+ } break;
11599
+ case LLM_ARCH_DEEPSEEK2:
11600
+ {
11601
+ result = llm.build_deepseek2();
11602
+ } break;
10997
11603
  default:
10998
11604
  GGML_ASSERT(false);
10999
11605
  }
@@ -11339,11 +11945,6 @@ static void llama_graph_compute(
11339
11945
  llama_context & lctx,
11340
11946
  ggml_cgraph * gf,
11341
11947
  int n_threads) {
11342
- #ifdef GGML_USE_MPI
11343
- const int64_t n_layer = lctx.model.hparams.n_layer;
11344
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
11345
- #endif
11346
-
11347
11948
  #ifdef GGML_USE_METAL
11348
11949
  if (ggml_backend_is_metal(lctx.backend_metal)) {
11349
11950
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11358,10 +11959,6 @@ static void llama_graph_compute(
11358
11959
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
11359
11960
 
11360
11961
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
11361
-
11362
- #ifdef GGML_USE_MPI
11363
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
11364
- #endif
11365
11962
  }
11366
11963
 
11367
11964
  // decode a batch of tokens by evaluating the transformer
@@ -11399,12 +11996,6 @@ static int llama_decode_internal(
11399
11996
  }
11400
11997
  lctx.n_queued_tokens += n_tokens_all;
11401
11998
 
11402
- #ifdef GGML_USE_MPI
11403
- // TODO: needs fix after #3228
11404
- GGML_ASSERT(false && "not implemented");
11405
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
11406
- #endif
11407
-
11408
11999
  auto & kv_self = lctx.kv_self;
11409
12000
 
11410
12001
  const int64_t n_embd = hparams.n_embd;
@@ -12298,6 +12889,7 @@ struct llm_tokenizer_bpe {
12298
12889
  });
12299
12890
  break;
12300
12891
  case LLAMA_VOCAB_PRE_TYPE_DBRX:
12892
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12301
12893
  word_collection = unicode_regex_split(text, {
12302
12894
  // same as llama3
12303
12895
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12354,6 +12946,7 @@ struct llm_tokenizer_bpe {
12354
12946
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12355
12947
  });
12356
12948
  break;
12949
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
12357
12950
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12358
12951
  word_collection = unicode_regex_split(text, {
12359
12952
  // original regex from tokenizer.json
@@ -12519,7 +13112,7 @@ struct llm_tokenizer_wpm {
12519
13112
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
12520
13113
 
12521
13114
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12522
- auto * token_map = &vocab.token_to_id;
13115
+ const auto & token_map = vocab.token_to_id;
12523
13116
 
12524
13117
  // normalize and split by whitespace
12525
13118
  std::vector<std::string> words = preprocess(text);
@@ -12534,108 +13127,89 @@ struct llm_tokenizer_wpm {
12534
13127
  }
12535
13128
 
12536
13129
  // prepend phantom space
12537
- std::string word1 = "\xe2\x96\x81" + word;
12538
- int n = word1.size();
13130
+ const std::string word1 = "\xe2\x96\x81" + word;
13131
+ const int n = word1.size();
12539
13132
 
12540
- // we're at the start of a new word
12541
- int i = 0;
12542
- bool match_any = false;
13133
+ const size_t current_tokens = output.size();
12543
13134
 
13135
+ // we're at the start of a new word
12544
13136
  // move through character position in word
12545
- while (i < n) {
13137
+ for (int i = 0; i < n; ++i) {
12546
13138
  // loop through possible match length
12547
13139
  bool match = false;
12548
13140
  for (int j = n; j > i; j--) {
12549
- auto it = token_map->find(word1.substr(i, j - i));
12550
- if (it != token_map->end()) {
13141
+ auto it = token_map.find(word1.substr(i, j - i));
13142
+ if (it != token_map.end()) {
12551
13143
  output.push_back(it->second);
12552
13144
  match = true;
12553
- match_any = true;
12554
- i = j;
13145
+ i = j - 1;
12555
13146
  break;
12556
13147
  }
12557
13148
  }
12558
13149
 
12559
- // must be an unknown character
12560
- if (!match) {
12561
- i++;
13150
+ if (!match) { // discard all
13151
+ output.resize(current_tokens);
13152
+ break; // and discard next tokens
12562
13153
  }
12563
13154
  }
12564
13155
 
12565
13156
  // we didn't find any matches for this word
12566
- if (!match_any) {
13157
+ if (current_tokens == output.size()) {
12567
13158
  output.push_back(vocab.special_unk_id);
12568
13159
  }
12569
13160
  }
12570
13161
  }
12571
13162
 
12572
13163
  std::vector<std::string> preprocess(const std::string & text) {
12573
- std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
12574
-
12575
- // strip accents, strip control, uniformize whitespace,
12576
- // to lowercase, pad chinese characters, pad punctuation
12577
- std::string new_str = "";
12578
- for (uint32_t code : cpts_nfd) {
12579
- const codepoint_flags flags = unicode_cpt_flags(code);
12580
- if (flags.is_accent_mark || flags.is_control) {
13164
+ const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
13165
+ std::vector<std::string> words(1, "");
13166
+
13167
+ for (const char32_t cpt : cpts_nfd) {
13168
+ const auto flags = unicode_cpt_flags(cpt);
13169
+
13170
+ if (flags.is_whitespace) {
13171
+ if (words.back().size()) { // finish previous word if any
13172
+ words.emplace_back();
13173
+ }
12581
13174
  continue;
12582
13175
  }
12583
- code = unicode_tolower(code);
12584
- if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12585
- code = ' ';
12586
- }
12587
- std::string s = unicode_cpt_to_utf8(code);
12588
- if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12589
- new_str += " ";
12590
- new_str += s;
12591
- new_str += " ";
12592
- } else {
12593
- new_str += s;
13176
+
13177
+ assert (!flags.is_separator);
13178
+ if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
13179
+ continue;
12594
13180
  }
12595
- }
12596
13181
 
12597
- // split by whitespace
12598
- uint64_t l = 0;
12599
- uint64_t r = 0;
12600
- std::vector<std::string> words;
12601
- while (r < new_str.size()) {
12602
- // if is whitespace
12603
- if (isspace(new_str[r], std::locale::classic())) {
12604
- if (r > l) words.push_back(new_str.substr(l, (r - l)));
12605
- l = r + 1;
12606
- r = l;
13182
+ const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
13183
+ if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
13184
+ if (words.back().size()) { // finish previous word if any
13185
+ words.emplace_back();
13186
+ }
13187
+ words.back() = s; // single char word
13188
+ words.emplace_back(); // start a new word
12607
13189
  } else {
12608
- r += 1;
13190
+ words.back() += s; // append char to word
12609
13191
  }
12610
13192
  }
12611
- if (r > l) {
12612
- words.push_back(new_str.substr(l, (r - l)));
12613
- }
12614
- return words;
12615
- }
12616
13193
 
12617
- bool is_ascii_punct(uint32_t code) {
12618
- if (code > 0xFF) {
12619
- return false;
13194
+ if (!words.back().size()) {
13195
+ words.pop_back();
12620
13196
  }
12621
- auto c = char(static_cast<unsigned char>(code));
12622
- return ispunct(c, std::locale::classic());
13197
+
13198
+ return words;
12623
13199
  }
12624
13200
 
12625
- bool is_chinese_char(uint32_t cpt) {
12626
- if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
12627
- (cpt >= 0x3400 && cpt <= 0x4DBF) ||
13201
+ static bool is_chinese_char(uint32_t cpt) {
13202
+ return
13203
+ (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
13204
+ (cpt >= 0x03400 && cpt <= 0x04DBF) ||
12628
13205
  (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
12629
13206
  (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
12630
13207
  (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
12631
13208
  (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
12632
- (cpt >= 0xF900 && cpt <= 0xFAFF) ||
12633
- (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
12634
- (cpt >= 0x3000 && cpt <= 0x303F) ||
12635
- (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
12636
- return true; // NOLINT
12637
- }
12638
- return false;
13209
+ (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
13210
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F);
13211
+ //(cpt >= 0x3000 && cpt <= 0x303F) ||
13212
+ //(cpt >= 0xFF00 && cpt <= 0xFFEF);
12639
13213
  }
12640
13214
 
12641
13215
  const llama_vocab & vocab;
@@ -12679,9 +13253,8 @@ struct fragment_buffer_variant {
12679
13253
 
12680
13254
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
12681
13255
  // for each special token
12682
- for (const auto & st: vocab.special_tokens_cache) {
12683
- const auto & special_token = st.first;
12684
- const auto & special_id = st.second;
13256
+ for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
13257
+ const auto & special_token = vocab.id_to_token[special_id].text;
12685
13258
 
12686
13259
  // for each text fragment
12687
13260
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12690,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12690
13263
 
12691
13264
  // if a fragment is text ( not yet processed )
12692
13265
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
12693
- auto * raw_text = &(fragment.raw_text);
13266
+ auto & raw_text = fragment.raw_text;
12694
13267
 
12695
13268
  auto raw_text_base_offset = fragment.offset;
12696
13269
  auto raw_text_base_length = fragment.length;
@@ -12700,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12700
13273
  // find the first occurrence of a given special token in this fragment
12701
13274
  // passing offset argument only limit the "search area" but match coordinates
12702
13275
  // are still relative to the source full raw_text
12703
- auto match = raw_text->find(special_token, raw_text_base_offset);
13276
+ auto match = raw_text.find(special_token, raw_text_base_offset);
12704
13277
 
12705
13278
  // no occurrences found, stop processing this fragment for a given special token
12706
13279
  if (match == std::string::npos) break;
@@ -12719,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12719
13292
  // left
12720
13293
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
12721
13294
  const int64_t left_reminder_length = match - raw_text_base_offset;
12722
- buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
13295
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
12723
13296
 
12724
13297
  #ifdef PRETOKENIZERDEBUG
12725
13298
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
@@ -12735,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12735
13308
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
12736
13309
  const int64_t right_reminder_offset = match + special_token.length();
12737
13310
  const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
12738
- buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
13311
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
12739
13312
 
12740
13313
  #ifdef PRETOKENIZERDEBUG
12741
13314
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
@@ -12788,9 +13361,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12788
13361
  // tokenizer.encode('', add_special_tokens=True) returns [1]
12789
13362
  // tokenizer.encode('', add_special_tokens=False) returns []
12790
13363
 
13364
+ static const bool rtrim = true; //TODO: as param
13365
+ bool is_prev_special = false;
13366
+ bool special_token_rtrim = false;
13367
+
12791
13368
  if (add_special && vocab.special_add_bos != 0) {
12792
13369
  GGML_ASSERT(vocab.special_bos_id != -1);
12793
13370
  output.push_back(vocab.special_bos_id);
13371
+ is_prev_special = true;
12794
13372
  }
12795
13373
 
12796
13374
  for (const auto & fragment : fragment_buffer) {
@@ -12802,9 +13380,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12802
13380
  // and passing 'add space prefix' as bool argument
12803
13381
  //
12804
13382
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12805
- if (&fragment == &fragment_buffer.front()) {
12806
- if (vocab.add_space_prefix) {
12807
- raw_text = " " + raw_text; // prefix with space if the first token is not special
13383
+
13384
+ if (special_token_rtrim) {
13385
+ size_t num_whitespaces = 0;
13386
+ while (isspace(raw_text[num_whitespaces])) {
13387
+ num_whitespaces++;
13388
+ }
13389
+ if (num_whitespaces == raw_text.size()) {
13390
+ continue; // skip if all whitespaces
13391
+ }
13392
+ raw_text = raw_text.substr(num_whitespaces);
13393
+ }
13394
+
13395
+ if (vocab.add_space_prefix) {
13396
+ if (!output.size() || is_prev_special) { // prefix with space if first token
13397
+ raw_text = " " + raw_text;
12808
13398
  }
12809
13399
  }
12810
13400
 
@@ -12816,6 +13406,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12816
13406
  tokenizer.tokenize(raw_text, output);
12817
13407
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
12818
13408
  output.push_back(fragment.token);
13409
+ is_prev_special = true;
13410
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
13411
+ special_token_rtrim = rtrim
13412
+ && fragment.token != vocab.special_bos_id
13413
+ && fragment.token != vocab.special_unk_id
13414
+ && fragment.token != vocab.special_eos_id;
12819
13415
  }
12820
13416
  }
12821
13417
 
@@ -13816,7 +14412,7 @@ void llama_sample_repetition_penalties(
13816
14412
 
13817
14413
  void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
13818
14414
  GGML_ASSERT(ctx);
13819
- const int64_t t_start_sample_us = ggml_time_us();
14415
+ int64_t t_start_sample_us = ggml_time_us();
13820
14416
 
13821
14417
  bool allow_eog = false;
13822
14418
  for (const auto & stack : grammar->stacks) {
@@ -13828,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
13828
14424
 
13829
14425
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
13830
14426
  candidates_decoded.reserve(candidates->size);
13831
- std::vector<llama_grammar_candidate> candidates_grammar;
14427
+
14428
+ std::vector<llama_grammar_candidate> candidates_grammar;
13832
14429
  candidates_grammar.reserve(candidates->size);
13833
14430
 
13834
14431
  for (size_t i = 0; i < candidates->size; ++i) {
13835
- const llama_token id = candidates->data[i].id;
13836
- const std::string piece = llama_token_to_piece(ctx, id, false);
14432
+ const llama_token id = candidates->data[i].id;
14433
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
13837
14434
 
13838
14435
  if (llama_token_is_eog(&ctx->model, id)) {
13839
14436
  if (!allow_eog) {
@@ -14033,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14033
14630
  GGML_ASSERT(false);
14034
14631
  }
14035
14632
 
14036
- const std::string piece = llama_token_to_piece(ctx, token, false);
14633
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
14037
14634
 
14038
14635
  // Note terminating 0 in decoded string
14039
14636
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14518,8 +15115,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14518
15115
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
14519
15116
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
14520
15117
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
14521
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
14522
- (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
14523
15118
  if (qs.model.type == MODEL_70B) {
14524
15119
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
14525
15120
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15533,10 +16128,6 @@ void llama_backend_init(void) {
15533
16128
  struct ggml_context * ctx = ggml_init(params);
15534
16129
  ggml_free(ctx);
15535
16130
  }
15536
-
15537
- #ifdef GGML_USE_MPI
15538
- ggml_mpi_backend_init();
15539
- #endif
15540
16131
  }
15541
16132
 
15542
16133
  void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15546,9 +16137,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
15546
16137
  }
15547
16138
 
15548
16139
  void llama_backend_free(void) {
15549
- #ifdef GGML_USE_MPI
15550
- ggml_mpi_backend_free();
15551
- #endif
15552
16140
  ggml_quantize_free();
15553
16141
  }
15554
16142
 
@@ -15691,6 +16279,7 @@ struct llama_context * llama_new_context_with_model(
15691
16279
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
15692
16280
  }
15693
16281
 
16282
+ cparams.yarn_attn_factor *= hparams.rope_attn_factor;
15694
16283
  cparams.causal_attn = hparams.causal_attn;
15695
16284
 
15696
16285
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15949,20 +16538,6 @@ struct llama_context * llama_new_context_with_model(
15949
16538
  }
15950
16539
  }
15951
16540
 
15952
- #ifdef GGML_USE_MPI
15953
- ctx->ctx_mpi = ggml_mpi_init();
15954
-
15955
- if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
15956
- // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
15957
- // TODO: needs fix after #3228
15958
- GGML_ASSERT(false && "not implemented");
15959
- //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
15960
- //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
15961
- llama_backend_free();
15962
- exit(1);
15963
- }
15964
- #endif
15965
-
15966
16541
  return ctx;
15967
16542
  }
15968
16543
 
@@ -15999,7 +16574,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15999
16574
  // these models do not use RoPE
16000
16575
  case LLM_ARCH_GPT2:
16001
16576
  case LLM_ARCH_GPTJ:
16002
- case LLM_ARCH_GPTNEOX:
16003
16577
  case LLM_ARCH_MPT:
16004
16578
  case LLM_ARCH_REFACT:
16005
16579
  case LLM_ARCH_BLOOM:
@@ -16019,13 +16593,14 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16019
16593
  case LLM_ARCH_XVERSE:
16020
16594
  case LLM_ARCH_COMMAND_R:
16021
16595
  case LLM_ARCH_OLMO:
16596
+ case LLM_ARCH_ARCTIC:
16597
+ case LLM_ARCH_DEEPSEEK2:
16022
16598
  return LLAMA_ROPE_TYPE_NORM;
16023
16599
 
16024
16600
  // the pairs of head values are offset by n_rot/2
16025
16601
  case LLM_ARCH_FALCON:
16026
16602
  case LLM_ARCH_GROK:
16027
16603
  case LLM_ARCH_DBRX:
16028
- case LLM_ARCH_PERSIMMON:
16029
16604
  case LLM_ARCH_BERT:
16030
16605
  case LLM_ARCH_NOMIC_BERT:
16031
16606
  case LLM_ARCH_STABLELM:
@@ -16036,6 +16611,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16036
16611
  case LLM_ARCH_PHI3:
16037
16612
  case LLM_ARCH_GEMMA:
16038
16613
  case LLM_ARCH_STARCODER2:
16614
+ case LLM_ARCH_GPTNEOX:
16039
16615
  return LLAMA_ROPE_TYPE_NEOX;
16040
16616
 
16041
16617
  // all model arches should be listed explicitly here
@@ -16195,6 +16771,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
16195
16771
  }
16196
16772
 
16197
16773
  // make tensors
16774
+ cvec.tensors.reserve(model.hparams.n_layer);
16198
16775
  cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
16199
16776
  for (size_t il = 1; il < model.hparams.n_layer; il++) {
16200
16777
  struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16203,6 +16780,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
16203
16780
  }
16204
16781
 
16205
16782
  // allocate tensors / buffers and zero
16783
+ cvec.ctxs.reserve(ctx_map.size());
16784
+ cvec.bufs.reserve(ctx_map.size());
16206
16785
  for (auto it : ctx_map) {
16207
16786
  ggml_backend_buffer_type_t buft = it.first;
16208
16787
  ggml_context * ctx = it.second;
@@ -17411,6 +17990,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
17411
17990
  ctx->cparams.n_threads_batch = n_threads_batch;
17412
17991
  }
17413
17992
 
17993
+ uint32_t llama_n_threads(struct llama_context * ctx) {
17994
+ return ctx->cparams.n_threads;
17995
+ }
17996
+
17997
+ uint32_t llama_n_threads_batch(struct llama_context * ctx) {
17998
+ return ctx->cparams.n_threads_batch;
17999
+ }
18000
+
17414
18001
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
17415
18002
  ctx->abort_callback = abort_callback;
17416
18003
  ctx->abort_callback_data = abort_callback_data;
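The two getters added above simply read back the values that llama_set_n_threads stores on the context. A minimal usage sketch (assumes ctx is a valid llama_context created elsewhere):

    #include "llama.h"
    #include <cstdio>

    static void report_threads(struct llama_context * ctx) {
        const uint32_t n_gen   = llama_n_threads(ctx);       // threads used for single-token generation
        const uint32_t n_batch = llama_n_threads_batch(ctx); // threads used for prompt/batch processing
        printf("threads: generation=%u batch=%u\n", n_gen, n_batch);
    }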
@@ -17634,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
17634
18221
  );
17635
18222
  }
17636
18223
 
18224
+ bool llama_token_is_control(const struct llama_model * model, llama_token token) {
18225
+ return llama_is_control_token(model->vocab, token);
18226
+ }
18227
+
17637
18228
  llama_token llama_token_bos(const struct llama_model * model) {
17638
18229
  return model->vocab.special_bos_id;
17639
18230
  }
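llama_token_is_control, added above, exposes the existing internal llama_is_control_token check through the public API. A small sketch that uses it to count the control tokens in a loaded vocabulary (assumes model is a valid llama_model loaded elsewhere):

    #include "llama.h"
    #include <cstdio>

    static void count_control_tokens(const struct llama_model * model) {
        const int32_t n_vocab = llama_n_vocab(model);
        int n_control = 0;
        for (llama_token id = 0; id < n_vocab; ++id) {
            if (llama_token_is_control(model, id)) {
                n_control++;
            }
        }
        printf("control tokens: %d / %d\n", n_control, n_vocab);
    }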
@@ -17705,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {
17705
18296
 
17706
18297
  const auto cpts = unicode_cpts_from_utf8(text);
17707
18298
  for (const auto cpt : cpts) {
17708
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
18299
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
18300
+ try {
18301
+ decoded_text += unicode_utf8_to_byte(utf8);
18302
+ } catch (const std::out_of_range & e) {
18303
+ decoded_text += "[UNK_BYTE_0x";
18304
+ for (const auto c : utf8) {
18305
+ decoded_text += format("%02x", (uint8_t) c);
18306
+ }
18307
+ decoded_text += text + "]";
18308
+ }
17709
18309
  }
17710
18310
 
17711
18311
  return decoded_text;
@@ -17713,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {
17713
18313
 
17714
18314
  // does not write null-terminator to buf
17715
18315
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
18316
+ // if we have a cache - use it
18317
+ {
18318
+ const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
18319
+
18320
+ if (!cache.empty()) {
18321
+ const auto & res = cache.at(token);
18322
+ if (length < (int) res.size()) {
18323
+ return -(int) res.size();
18324
+ }
18325
+ memcpy(buf, res.c_str(), res.size());
18326
+ return res.size();
18327
+ }
18328
+ }
18329
+
17716
18330
  if (0 <= token && token < llama_n_vocab(model)) {
17717
18331
  switch (llama_vocab_get_type(model->vocab)) {
17718
- case LLAMA_VOCAB_TYPE_WPM:
17719
- case LLAMA_VOCAB_TYPE_SPM: {
17720
- // NOTE: we accept all unsupported token types,
17721
- // suppressing them like CONTROL tokens.
17722
- if (llama_is_normal_token(model->vocab, token)) {
17723
- std::string result = model->vocab.id_to_token[token].text;
17724
- llama_unescape_whitespace(result);
17725
- if (length < (int) result.length()) {
17726
- return -(int) result.length();
17727
- }
17728
- memcpy(buf, result.c_str(), result.length());
17729
- return result.length();
17730
- } else if (
17731
- (llama_is_user_defined_token(model->vocab, token)) ||
17732
- (llama_is_control_token (model->vocab, token) && special)) {
17733
- std::string result = model->vocab.id_to_token[token].text;
17734
- if (length < (int) result.length()) {
17735
- return -(int) result.length();
17736
- }
17737
- memcpy(buf, result.c_str(), result.length());
17738
- return result.length();
17739
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
17740
- if (length < 3) {
17741
- return -3;
17742
- }
17743
- memcpy(buf, "\xe2\x96\x85", 3);
17744
- return 3;
17745
- } else if (llama_is_byte_token(model->vocab, token)) {
17746
- if (length < 1) {
17747
- return -1;
18332
+ case LLAMA_VOCAB_TYPE_WPM:
18333
+ case LLAMA_VOCAB_TYPE_SPM: {
18334
+ // NOTE: we accept all unsupported token types,
18335
+ // suppressing them like CONTROL tokens.
18336
+ if (llama_is_normal_token(model->vocab, token)) {
18337
+ std::string result = model->vocab.id_to_token[token].text;
18338
+ llama_unescape_whitespace(result);
18339
+ if (length < (int) result.length()) {
18340
+ return -(int) result.length();
18341
+ }
18342
+ memcpy(buf, result.c_str(), result.length());
18343
+ return result.length();
18344
+ } else if (
18345
+ (llama_is_user_defined_token(model->vocab, token)) ||
18346
+ (llama_is_control_token (model->vocab, token) && special)) {
18347
+ std::string result = model->vocab.id_to_token[token].text;
18348
+ if (length < (int) result.length()) {
18349
+ return -(int) result.length();
18350
+ }
18351
+ memcpy(buf, result.c_str(), result.length());
18352
+ return result.length();
18353
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
18354
+ if (length < 3) {
18355
+ return -3;
18356
+ }
18357
+ memcpy(buf, "\xe2\x96\x85", 3);
18358
+ return 3;
18359
+ } else if (llama_is_byte_token(model->vocab, token)) {
18360
+ if (length < 1) {
18361
+ return -1;
18362
+ }
18363
+ buf[0] = llama_token_to_byte(model->vocab, token);
18364
+ return 1;
17748
18365
  }
17749
- buf[0] = llama_token_to_byte(model->vocab, token);
17750
- return 1;
18366
+ break;
17751
18367
  }
17752
- break;
17753
- }
17754
- case LLAMA_VOCAB_TYPE_BPE: {
17755
- // NOTE: we accept all unsupported token types,
17756
- // suppressing them like CONTROL tokens.
17757
- if (llama_is_normal_token(model->vocab, token)) {
17758
- std::string result = model->vocab.id_to_token[token].text;
17759
- result = llama_decode_text(result);
17760
- if (length < (int) result.length()) {
17761
- return -(int) result.length();
17762
- }
17763
- memcpy(buf, result.c_str(), result.length());
17764
- return result.length();
17765
- } else if (
17766
- (llama_is_user_defined_token(model->vocab, token)) ||
17767
- (llama_is_control_token (model->vocab, token) && special)) {
17768
- std::string result = model->vocab.id_to_token[token].text;
17769
- if (length < (int) result.length()) {
17770
- return -(int) result.length();
18368
+ case LLAMA_VOCAB_TYPE_BPE: {
18369
+ // NOTE: we accept all unsupported token types,
18370
+ // suppressing them like CONTROL tokens.
18371
+ if (llama_is_normal_token(model->vocab, token)) {
18372
+ std::string result = model->vocab.id_to_token[token].text;
18373
+ result = llama_decode_text(result);
18374
+ if (length < (int) result.length()) {
18375
+ return -(int) result.length();
18376
+ }
18377
+ memcpy(buf, result.c_str(), result.length());
18378
+ return result.length();
18379
+ } else if (
18380
+ (llama_is_user_defined_token(model->vocab, token)) ||
18381
+ (llama_is_control_token (model->vocab, token) && special)) {
18382
+ std::string result = model->vocab.id_to_token[token].text;
18383
+ if (length < (int) result.length()) {
18384
+ return -(int) result.length();
18385
+ }
18386
+ memcpy(buf, result.c_str(), result.length());
18387
+ return result.length();
17771
18388
  }
17772
- memcpy(buf, result.c_str(), result.length());
17773
- return result.length();
18389
+ break;
17774
18390
  }
17775
- break;
17776
- }
17777
- default:
17778
- GGML_ASSERT(false);
18391
+ default:
18392
+ GGML_ASSERT(false);
17779
18393
  }
17780
18394
  }
17781
18395
  return 0;
@@ -17845,6 +18459,15 @@ static int32_t llama_chat_apply_template_internal(
17845
18459
  }
17846
18460
  }
17847
18461
  // llama2 templates seem to not care about "add_generation_prompt"
18462
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
18463
+ // Phi 3
18464
+ for (auto message : chat) {
18465
+ std::string role(message->role);
18466
+ ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
18467
+ }
18468
+ if (add_ass) {
18469
+ ss << "<|assistant|>\n";
18470
+ }
17848
18471
  } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
17849
18472
  // zephyr template
17850
18473
  for (auto message : chat) {
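The Phi 3 branch added in the hunk above renders each message as <|role|>\n...<|end|>\n and appends <|assistant|>\n when a generation prompt is requested. A small sketch exercising it through the public template API, passing the template name directly so no model handle is needed (the messages and buffer size are illustrative):

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    static void show_phi3_prompt() {
        const llama_chat_message chat[] = {
            { "user",      "Hello"               },
            { "assistant", "Hi, how can I help?" },
            { "user",      "Tell me a joke."     },
        };
        std::vector<char> buf(1024);
        const int32_t n = llama_chat_apply_template(
            nullptr, "phi3", chat, 3, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            printf("%.*s", n, buf.data()); // <|user|>\nHello<|end|>\n ... <|assistant|>\n
        }
    }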
@@ -17977,15 +18600,6 @@ static int32_t llama_chat_apply_template_internal(
17977
18600
  if (add_ass) {
17978
18601
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
17979
18602
  }
17980
- } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
17981
- // Phi 3
17982
- for (auto message : chat) {
17983
- std::string role(message->role);
17984
- ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
17985
- }
17986
- if (add_ass) {
17987
- ss << "<|assistant|>\n";
17988
- }
17989
18603
  } else {
17990
18604
  // template not supported
17991
18605
  return -1;
@@ -18107,8 +18721,10 @@ const char * llama_print_system_info(void) {
18107
18721
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
18108
18722
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
18109
18723
  s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
18724
+ s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
18110
18725
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
18111
18726
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
18727
+ s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
18112
18728
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
18113
18729
  s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
18114
18730
  s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
@@ -18167,6 +18783,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
18167
18783
  g_state.log_callback_user_data = user_data;
18168
18784
  #ifdef GGML_USE_METAL
18169
18785
  ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
18786
+ #elif defined(GGML_USE_CUDA)
18787
+ ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
18170
18788
  #endif
18171
18789
  }
18172
18790