llama_cpp 0.15.2 → 0.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,16 +26,9 @@
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
- #ifdef GGML_USE_MPI
- # include "ggml-mpi.h"
- #endif
- #ifndef QK_K
- # ifdef GGML_QKK_64
- # define QK_K 64
- # else
- # define QK_K 256
- # endif
- #endif
+
+ // TODO: replace with ggml API call
+ #define QK_K 256

  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 60
+ #define LLAMA_MAX_EXPERTS 128

  //
  // logging
@@ -205,7 +198,6 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,7 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_ARCTIC,
  LLM_ARCH_UNKNOWN,
  };

@@ -242,7 +235,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MPT, "mpt" },
  { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -266,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_ARCTIC, "arctic" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -309,6 +302,7 @@ enum llm_kv {
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
  LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,

@@ -386,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

@@ -441,6 +436,8 @@ enum llm_tensor {
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
  LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
  LLM_TENSOR_ATTN_Q,
  LLM_TENSOR_ATTN_K,
  LLM_TENSOR_ATTN_V,
@@ -460,6 +457,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_NORM_EXPS,
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
@@ -598,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
- {
- LLM_ARCH_PERSIMMON,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
- { LLM_TENSOR_OUTPUT, "output"},
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
- },
- },
  {
  LLM_ARCH_MPT,
  {
@@ -825,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  {
  LLM_ARCH_PHI3,
  {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -1052,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_ARCTIC,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1697,6 +1702,8 @@ struct llama_state {
  llama_state() {
  #ifdef GGML_USE_METAL
  ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+ ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
  #endif
  }

@@ -1710,17 +1717,24 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_14M,
  MODEL_17M,
  MODEL_22M,
  MODEL_33M,
+ MODEL_70M,
  MODEL_109M,
  MODEL_137M,
+ MODEL_160M,
  MODEL_335M,
+ MODEL_410M,
  MODEL_0_5B,
  MODEL_1B,
+ MODEL_1_4B,
  MODEL_2B,
+ MODEL_2_8B,
  MODEL_3B,
  MODEL_4B,
+ MODEL_6_9B,
  MODEL_7B,
  MODEL_8B,
  MODEL_12B,
@@ -1743,6 +1757,7 @@ enum e_model {
  MODEL_8x7B,
  MODEL_8x22B,
  MODEL_16x12B,
+ MODEL_10B_128x3_66B,
  };

  static const size_t kiB = 1024;
@@ -1752,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
  struct llama_hparams {
  bool vocab_only;
  bool rope_finetuned;
+ bool use_par_res;

  uint32_t n_vocab;
  uint32_t n_ctx_train; // context size the model was trained on
@@ -1770,6 +1786,7 @@ struct llama_hparams {
  float f_norm_eps;
  float f_norm_rms_eps;

+ float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
@@ -1818,6 +1835,7 @@ struct llama_hparams {

  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+ if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

@@ -1915,6 +1933,7 @@ struct llama_layer {
  struct ggml_tensor * ffn_norm_b;
  struct ggml_tensor * layer_out_norm;
  struct ggml_tensor * layer_out_norm_b;
+ struct ggml_tensor * ffn_norm_exps;

  // ff
  struct ggml_tensor * ffn_gate; // w1
@@ -1952,6 +1971,10 @@ struct llama_layer {
  // mamba bias
  struct ggml_tensor * ssm_conv1d_b;
  struct ggml_tensor * ssm_dt_b;
+
+ // long rope factors
+ struct ggml_tensor * rope_long = nullptr;
+ struct ggml_tensor * rope_short = nullptr;
  };

  struct llama_kv_cell {
@@ -2268,10 +2291,6 @@ struct llama_context {

  // control vectors
  struct llama_control_vector cvec;
-
- #ifdef GGML_USE_MPI
- ggml_mpi_context * ctx_mpi = NULL;
- #endif
  };

  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2510,6 @@ static bool llama_kv_cache_init(
  static bool llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_batch & batch) {
- const uint32_t n_ctx = cache.size;
  const uint32_t n_tokens = batch.n_tokens;

  if (cache.recurrent) {
@@ -2542,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
  }
  // otherwise, one cell per token.

- if (n_tokens > n_ctx) {
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ if (n_tokens > cache.size) {
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
  return false;
  }

  uint32_t n_tested = 0;

  while (true) {
- if (cache.head + n_tokens > n_ctx) {
- n_tested += n_ctx - cache.head;
+ if (cache.head + n_tokens > cache.size) {
+ n_tested += cache.size - cache.head;
  cache.head = 0;
  continue;
  }
@@ -2570,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
  break;
  }

- if (n_tested >= n_ctx) {
+ if (n_tested >= cache.size) {
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
  return false;
  }
@@ -3330,6 +3348,39 @@ struct llama_model_loader {
  return get_arr_n(llm_kv(kid), result, required);
  }

+ template<typename T>
+ bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+ const int kid = gguf_find_key(meta, key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+ if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+ throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+ }
+
+ // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+ result.resize(arr_info.length);
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+ return true;
+ }
+
+ template<typename T>
+ bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+ return get_arr(llm_kv(kid), result, required);
+ }
+
  template<typename T>
  bool get_key(const std::string & key, T & result, const bool required = true) {
  auto it = kv_overrides.find(key);
@@ -3404,11 +3455,15 @@ struct llama_model_loader {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
  ggml_set_name(tensor, ggml_get_name(cur));

- n_created++;
+ if (duplicated) {
+ size_data += ggml_nbytes(cur);
+ } else {
+ n_created++;
+ }

  return tensor;
  }
@@ -3443,14 +3498,17 @@ struct llama_model_loader {
  return cur;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+ static const int TENSOR_NOT_REQUIRED = 1;
+ static const int TENSOR_DUPLICATED = 2;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

  if (cur == NULL) {
  return NULL;
  }

- return create_tensor_for(ctx, cur);
+ return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
  }

  struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3750,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

  static const char * llama_model_type_name(e_model type) {
  switch (type) {
- case MODEL_22M: return "22M";
- case MODEL_33M: return "33M";
- case MODEL_109M: return "109M";
- case MODEL_137M: return "137M";
- case MODEL_0_5B: return "0.5B";
- case MODEL_1B: return "1B";
- case MODEL_2B: return "2B";
- case MODEL_3B: return "3B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_12B: return "12B";
- case MODEL_13B: return "13B";
- case MODEL_14B: return "14B";
- case MODEL_15B: return "15B";
- case MODEL_20B: return "20B";
- case MODEL_30B: return "30B";
- case MODEL_34B: return "34B";
- case MODEL_35B: return "35B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- case MODEL_314B: return "314B";
- case MODEL_SMALL: return "0.1B";
- case MODEL_MEDIUM: return "0.4B";
- case MODEL_LARGE: return "0.8B";
- case MODEL_XL: return "1.5B";
- case MODEL_A2_7B: return "A2.7B";
- case MODEL_8x7B: return "8x7B";
- case MODEL_8x22B: return "8x22B";
- case MODEL_16x12B: return "16x12B";
- default: return "?B";
+ case MODEL_14M: return "14M";
+ case MODEL_17M: return "17M";
+ case MODEL_22M: return "22M";
+ case MODEL_33M: return "33M";
+ case MODEL_70M: return "70M";
+ case MODEL_109M: return "109M";
+ case MODEL_137M: return "137M";
+ case MODEL_160M: return "160M";
+ case MODEL_335M: return "335M";
+ case MODEL_410M: return "410M";
+ case MODEL_0_5B: return "0.5B";
+ case MODEL_1B: return "1B";
+ case MODEL_1_4B: return "1.4B";
+ case MODEL_2B: return "2B";
+ case MODEL_2_8B: return "2.8B";
+ case MODEL_3B: return "3B";
+ case MODEL_4B: return "4B";
+ case MODEL_6_9B: return "6.9B";
+ case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
+ case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
+ case MODEL_15B: return "15B";
+ case MODEL_20B: return "20B";
+ case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
+ case MODEL_35B: return "35B";
+ case MODEL_40B: return "40B";
+ case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
+ case MODEL_314B: return "314B";
+ case MODEL_SMALL: return "0.1B";
+ case MODEL_MEDIUM: return "0.4B";
+ case MODEL_LARGE: return "0.8B";
+ case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
+ case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+ default: return "?B";
  }
  }

@@ -3873,6 +3942,8 @@ static void llm_load_hparams(
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
  // sanity check for n_rot (optional)
  {
  hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3972,14 +4043,6 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 36: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
  case LLM_ARCH_REFACT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4184,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_14B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4261,6 +4325,65 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ switch (hparams.n_layer) {
+ case 6:
+ switch (hparams.n_ff) {
+ case 512: model.type = e_model::MODEL_14M; break;
+ case 2048: model.type = e_model::MODEL_70M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 12:
+ switch (hparams.n_ff) {
+ case 3072: model.type = e_model::MODEL_160M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 16:
+ switch (hparams.n_ff) {
+ case 8192: model.type = e_model::MODEL_1B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff) {
+ case 4096: model.type = e_model::MODEL_410M; break;
+ case 8192: model.type = e_model::MODEL_1_4B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 32:
+ switch (hparams.n_ff) {
+ case 10240: model.type = e_model::MODEL_2_8B; break;
+ case 16384: model.type = e_model::MODEL_6_9B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 36:
+ switch (hparams.n_ff) {
+ case 20480: model.type = e_model::MODEL_12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 44:
+ switch (hparams.n_ff) {
+ case 24576: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ if (hparams.n_expert == 128) {
+ switch (hparams.n_layer) {
+ case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4461,6 +4584,9 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "qwen2") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "stablelm2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
  } else if (
  tokenizer_pre == "olmo") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -4582,7 +4708,8 @@ static void llm_load_vocab(
  (t.first == "<|eot_id|>" ||
  t.first == "<|im_end|>" ||
  t.first == "<|end|>" ||
- t.first == "<end_of_turn>"
+ t.first == "<end_of_turn>" ||
+ t.first == "<|endoftext|>"
  )
  ) {
  vocab.special_eot_id = t.second;
@@ -4908,6 +5035,7 @@ static bool llm_load_tensors(
  // create tensors for the weights
  {
  const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_head = n_embd / hparams.n_head;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4942,12 +5070,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  if (model.arch != LLM_ARCH_MINICPM){
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }
  }
@@ -4966,10 +5092,10 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

@@ -4980,7 +5106,7 @@ static bool llm_load_tensors(
  } else {
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5022,12 +5148,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5050,7 +5174,7 @@ static bool llm_load_tensors(

  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5152,11 +5276,9 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5169,8 +5291,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5188,7 +5310,12 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ if (!model.output) {
+ // needs to be on GPU
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5343,6 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
- {
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
- }
-
- for (int i = 0; i < n_layer; ++i) {
- ggml_context * ctx_layer = ctx_for_layer(i);
- ggml_context * ctx_split = ctx_for_layer_split(i);
-
- auto & layer = model.layers[i];
-
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
- }
- } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  {
@@ -5325,14 +5411,14 @@ static bool llm_load_tensors(
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
@@ -5394,18 +5480,16 @@ static bool llm_load_tensors(
  case LLM_ARCH_MPT:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5416,31 +5500,31 @@ static bool llm_load_tensors(
  auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // AWQ ScaleActivation layer
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  }
  } break;
  case LLM_ARCH_STABLELM:
@@ -5469,17 +5553,17 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors, present in Stable LM 2 1.6B
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional q and k layernorms, present in StableLM 2 12B
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5522,12 +5606,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5625,8 +5707,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  if (layer.wqkv == nullptr) {
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5663,17 +5745,20 @@ static bool llm_load_tensors(
  ggml_context* ctx_layer = ctx_for_layer(i);
  ggml_context* ctx_split = ctx_for_layer_split(i);

- auto& layer = model.layers[i];
+ auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  }
  } break;
  case LLM_ARCH_PLAMO:
@@ -5842,9 +5927,7 @@ static bool llm_load_tensors(

  // output
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

  const int64_t n_ff = hparams.n_ff;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5879,12 +5962,10 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  }
@@ -5935,12 +6016,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -6001,9 +6080,7 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  // init output from the input tok embed
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6112,10 @@ static bool llm_load_tensors(

  // output
  {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -6060,6 +6135,81 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -6324,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(
6324
6474
 
6325
6475
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
6326
6476
  } else {
6327
- #ifdef GGML_USE_MPI
6328
- GGML_ASSERT(false && "not implemented");
6329
- #endif
6330
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6477
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6331
6478
  inpL = lctx.inp_embd;
6332
6479
  ggml_set_input(lctx.inp_embd);
6333
6480
  }
@@ -6652,7 +6799,7 @@ static struct ggml_tensor * llm_build_kqv(
6652
6799
 
6653
6800
  cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6654
6801
 
6655
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6802
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6656
6803
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6657
6804
  }
6658
6805
 
@@ -6661,7 +6808,7 @@ static struct ggml_tensor * llm_build_kqv(
6661
6808
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6662
6809
  cb(kq, "kq", il);
6663
6810
 
6664
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6811
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6665
6812
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6666
6813
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6667
6814
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6886,17 +7033,20 @@ struct llm_build_context {
6886
7033
  cb(lctx.inp_K_shift, "K_shift", -1);
6887
7034
  ggml_set_input(lctx.inp_K_shift);
6888
7035
 
7036
+
6889
7037
  for (int il = 0; il < n_layer; ++il) {
7038
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
6890
7039
  struct ggml_tensor * tmp =
6891
7040
  // we rotate only the first n_rot dimensions
6892
- ggml_rope_custom_inplace(ctx0,
7041
+ ggml_rope_ext_inplace(ctx0,
6893
7042
  ggml_view_3d(ctx0, kv_self.k_l[il],
6894
7043
  n_embd_head_k, n_head_kv, n_ctx,
6895
7044
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
6896
7045
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6897
7046
  0),
6898
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7047
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6899
7048
  ext_factor, attn_factor, beta_fast, beta_slow);
7049
+
6900
7050
  cb(tmp, "K_shifted", il);
6901
7051
  ggml_build_forward_expand(gf, tmp);
6902
7052
  }
@@ -6999,6 +7149,17 @@ struct llm_build_context {
6999
7149
  return lctx.inp_pos;
7000
7150
  }
7001
7151
 
7152
+ struct ggml_tensor * build_rope_factors(int il) {
7153
+ // choose long/short freq factors based on the context size
7154
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7155
+
7156
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7157
+ return model.layers[il].rope_long;
7158
+ }
7159
+
7160
+ return model.layers[il].rope_short;
7161
+ }
7162
+
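A standalone sketch of the selection rule the new build_rope_factors helper applies: when the per-sequence context exceeds the original (pre-finetune) training context, the "long" frequency factors are chosen, otherwise the "short" ones. The enum and numbers below are purely illustrative stand-ins, not library types.

#include <cstdint>
#include <cstdio>

// Stand-in for the per-layer long/short factor tensors.
enum class rope_factors { long_factors, short_factors };

// Mirrors the selection logic: compare the per-sequence context
// against the original YaRN context length.
static rope_factors select_rope_factors(uint32_t n_ctx, uint32_t n_seq_max, uint32_t n_yarn_orig_ctx) {
    const uint32_t n_ctx_seq = n_ctx / n_seq_max;
    return n_ctx_seq > n_yarn_orig_ctx ? rope_factors::long_factors
                                       : rope_factors::short_factors;
}

int main() {
    // Illustrative numbers: a 128k context over one sequence, hypothetical
    // original training context of 4096.
    const auto f1 = select_rope_factors(131072, 1, 4096);
    const auto f2 = select_rope_factors(  4096, 1, 4096);
    printf("%s\n", f1 == rope_factors::long_factors ? "long" : "short");  // long
    printf("%s\n", f2 == rope_factors::long_factors ? "long" : "short");  // short
    return 0;
}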
7002
7163
  struct ggml_tensor * build_inp_out_ids() {
7003
7164
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
7004
7165
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -7106,15 +7267,15 @@ struct llm_build_context {
7106
7267
  cb(Vcur, "Vcur", il);
7107
7268
  }
7108
7269
 
7109
- Qcur = ggml_rope_custom(
7110
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7270
+ Qcur = ggml_rope_ext(
7271
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7111
7272
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7112
7273
  ext_factor, attn_factor, beta_fast, beta_slow
7113
7274
  );
7114
7275
  cb(Qcur, "Qcur", il);
7115
7276
 
7116
- Kcur = ggml_rope_custom(
7117
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7277
+ Kcur = ggml_rope_ext(
7278
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7118
7279
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7119
7280
  ext_factor, attn_factor, beta_fast, beta_slow
7120
7281
  );
@@ -7236,13 +7397,13 @@ struct llm_build_context {
7236
7397
 
7237
7398
  switch (model.type) {
7238
7399
  case MODEL_7B:
7239
- Qcur = ggml_rope_custom(
7240
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7400
+ Qcur = ggml_rope_ext(
7401
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7241
7402
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7242
7403
  ext_factor, attn_factor, beta_fast, beta_slow
7243
7404
  );
7244
- Kcur = ggml_rope_custom(
7245
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7405
+ Kcur = ggml_rope_ext(
7406
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7246
7407
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7408
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7409
  );
@@ -7348,15 +7509,15 @@ struct llm_build_context {
7348
7509
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7349
7510
  cb(Vcur, "Vcur", il);
7350
7511
 
7351
- Qcur = ggml_rope_custom(
7352
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7512
+ Qcur = ggml_rope_ext(
7513
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7353
7514
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7354
7515
  ext_factor, attn_factor, beta_fast, beta_slow
7355
7516
  );
7356
7517
  cb(Qcur, "Qcur", il);
7357
7518
 
7358
- Kcur = ggml_rope_custom(
7359
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7519
+ Kcur = ggml_rope_ext(
7520
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7360
7521
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7361
7522
  ext_factor, attn_factor, beta_fast, beta_slow
7362
7523
  );
@@ -7469,14 +7630,14 @@ struct llm_build_context {
7469
7630
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7470
7631
 
7471
7632
  // using mode = 2 for neox mode
7472
- Qcur = ggml_rope_custom(
7473
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7633
+ Qcur = ggml_rope_ext(
7634
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7474
7635
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7475
7636
  );
7476
7637
  cb(Qcur, "Qcur", il);
7477
7638
 
7478
- Kcur = ggml_rope_custom(
7479
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7639
+ Kcur = ggml_rope_ext(
7640
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7480
7641
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7481
7642
  );
7482
7643
  cb(Kcur, "Kcur", il);
@@ -7592,15 +7753,15 @@ struct llm_build_context {
7592
7753
  cb(Vcur, "Vcur", il);
7593
7754
  }
7594
7755
 
7595
- Qcur = ggml_rope_custom(
7596
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7756
+ Qcur = ggml_rope_ext(
7757
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7597
7758
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7598
7759
  ext_factor, attn_factor, beta_fast, beta_slow
7599
7760
  );
7600
7761
  cb(Qcur, "Qcur", il);
7601
7762
 
7602
- Kcur = ggml_rope_custom(
7603
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7763
+ Kcur = ggml_rope_ext(
7764
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7604
7765
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7605
7766
  ext_factor, attn_factor, beta_fast, beta_slow
7606
7767
  );
@@ -7744,15 +7905,15 @@ struct llm_build_context {
7744
7905
  cb(Kcur, "Kcur", il);
7745
7906
  cb(Vcur, "Vcur", il);
7746
7907
 
7747
- Qcur = ggml_rope_custom(
7748
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7908
+ Qcur = ggml_rope_ext(
7909
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7749
7910
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7750
7911
  ext_factor, attn_factor, beta_fast, beta_slow
7751
7912
  );
7752
7913
  cb(Qcur, "Qcur", il);
7753
7914
 
7754
- Kcur = ggml_rope_custom(
7755
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7915
+ Kcur = ggml_rope_ext(
7916
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7756
7917
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7757
7918
  ext_factor, attn_factor, beta_fast, beta_slow
7758
7919
  );
@@ -7921,256 +8082,49 @@ struct llm_build_context {
7921
8082
  return gf;
7922
8083
  }
7923
8084
 
7924
- struct ggml_cgraph * build_persimmon() {
8085
+ struct ggml_cgraph * build_refact() {
7925
8086
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7926
8087
 
7927
8088
  const int64_t n_embd_head = hparams.n_embd_head_v;
7928
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7929
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
8089
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7930
8090
 
7931
8091
  struct ggml_tensor * cur;
7932
8092
  struct ggml_tensor * inpL;
7933
8093
 
7934
8094
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7935
8095
 
7936
- // inp_pos - contains the positions
7937
- struct ggml_tensor * inp_pos = build_inp_pos();
7938
-
7939
8096
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7940
8097
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7941
8098
 
7942
8099
  for (int il = 0; il < n_layer; ++il) {
7943
- struct ggml_tensor * residual = inpL;
8100
+ struct ggml_tensor * inpSA = inpL;
7944
8101
 
7945
8102
  cur = llm_build_norm(ctx0, inpL, hparams,
7946
- model.layers[il].attn_norm,
7947
- model.layers[il].attn_norm_b,
7948
- LLM_NORM, cb, il);
8103
+ model.layers[il].attn_norm, NULL,
8104
+ LLM_NORM_RMS, cb, il);
7949
8105
  cb(cur, "attn_norm", il);
7950
8106
 
7951
- // self attention
8107
+ // self-attention
7952
8108
  {
7953
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7954
- cb(cur, "wqkv", il);
8109
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8110
+ cb(Qcur, "Qcur", il);
7955
8111
 
7956
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7957
- cb(cur, "bqkv", il);
8112
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8113
+ cb(Kcur, "Kcur", il);
7958
8114
 
7959
- // split qkv
7960
- GGML_ASSERT(n_head_kv == n_head);
8115
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8116
+ cb(Vcur, "Vcur", il);
7961
8117
 
7962
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7963
- cb(tmpqkv, "tmpqkv", il);
8118
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8119
+ cb(Kcur, "Kcur", il);
7964
8120
 
7965
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7966
- cb(tmpqkv_perm, "tmpqkv", il);
8121
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8122
+ cb(Qcur, "Qcur", il);
7967
8123
 
7968
- struct ggml_tensor * tmpq = ggml_view_3d(
7969
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7970
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7971
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7972
- 0
7973
- );
7974
- cb(tmpq, "tmpq", il);
7975
-
7976
- struct ggml_tensor * tmpk = ggml_view_3d(
7977
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7978
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7979
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7980
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
7981
- );
7982
- cb(tmpk, "tmpk", il);
7983
-
7984
- // Q/K Layernorm
7985
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
7986
- model.layers[il].attn_q_norm,
7987
- model.layers[il].attn_q_norm_b,
7988
- LLM_NORM, cb, il);
7989
- cb(tmpq, "tmpq", il);
7990
-
7991
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
7992
- model.layers[il].attn_k_norm,
7993
- model.layers[il].attn_k_norm_b,
7994
- LLM_NORM, cb, il);
7995
- cb(tmpk, "tmpk", il);
7996
-
7997
- // RoPE the first n_rot of q/k, pass the other half, and concat.
7998
- struct ggml_tensor * qrot = ggml_view_3d(
7999
- ctx0, tmpq, n_rot, n_head, n_tokens,
8000
- ggml_element_size(tmpq) * n_embd_head,
8001
- ggml_element_size(tmpq) * n_embd_head * n_head,
8002
- 0
8003
- );
8004
- cb(qrot, "qrot", il);
8005
-
8006
- struct ggml_tensor * krot = ggml_view_3d(
8007
- ctx0, tmpk, n_rot, n_head, n_tokens,
8008
- ggml_element_size(tmpk) * n_embd_head,
8009
- ggml_element_size(tmpk) * n_embd_head * n_head,
8010
- 0
8011
- );
8012
- cb(krot, "krot", il);
8013
-
8014
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
8015
- struct ggml_tensor * qpass = ggml_view_3d(
8016
- ctx0, tmpq, n_rot, n_head, n_tokens,
8017
- ggml_element_size(tmpq) * n_embd_head,
8018
- ggml_element_size(tmpq) * n_embd_head * n_head,
8019
- ggml_element_size(tmpq) * n_rot
8020
- );
8021
- cb(qpass, "qpass", il);
8022
-
8023
- struct ggml_tensor * kpass = ggml_view_3d(
8024
- ctx0, tmpk, n_rot, n_head, n_tokens,
8025
- ggml_element_size(tmpk) * n_embd_head,
8026
- ggml_element_size(tmpk) * n_embd_head * n_head,
8027
- ggml_element_size(tmpk) * n_rot
8028
- );
8029
- cb(kpass, "kpass", il);
8030
-
8031
- struct ggml_tensor * qrotated = ggml_rope_custom(
8032
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8033
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8034
- );
8035
- cb(qrotated, "qrotated", il);
8036
-
8037
- struct ggml_tensor * krotated = ggml_rope_custom(
8038
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8039
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8040
- );
8041
- cb(krotated, "krotated", il);
8042
-
8043
- // ggml currently only supports concatenation on dim=2
8044
- // so we need to permute qrot, qpass, concat, then permute back.
8045
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
8046
- cb(qrotated, "qrotated", il);
8047
-
8048
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
8049
- cb(krotated, "krotated", il);
8050
-
8051
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
8052
- cb(qpass, "qpass", il);
8053
-
8054
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
8055
- cb(kpass, "kpass", il);
8056
-
8057
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
8058
- cb(Qcur, "Qcur", il);
8059
-
8060
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
8061
- cb(Kcur, "Kcur", il);
8062
-
8063
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
8064
- cb(Q, "Q", il);
8065
-
8066
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
8067
- cb(Kcur, "Kcur", il);
8068
-
8069
- struct ggml_tensor * Vcur = ggml_view_3d(
8070
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
8071
- ggml_element_size(tmpqkv_perm) * n_embd_head,
8072
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
8073
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
8074
- );
8075
- cb(Vcur, "Vcur", il);
8076
-
8077
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8078
- model.layers[il].wo, model.layers[il].bo,
8079
- Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8080
- }
8081
-
8082
- if (il == n_layer - 1) {
8083
- // skip computing output for unused tokens
8084
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8085
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8086
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8087
- }
8088
-
8089
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
8090
- cb(ffn_inp, "ffn_inp", il);
8091
-
8092
- // feed-forward network
8093
- {
8094
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
8095
- model.layers[il].ffn_norm,
8096
- model.layers[il].ffn_norm_b,
8097
- LLM_NORM, cb, il);
8098
- cb(cur, "ffn_norm", il);
8099
-
8100
- cur = llm_build_ffn(ctx0, cur,
8101
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8102
- NULL, NULL,
8103
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8104
- NULL,
8105
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
8106
- cb(cur, "ffn_out", il);
8107
- }
8108
-
8109
- cur = ggml_add(ctx0, cur, ffn_inp);
8110
- cb(cur, "l_out", il);
8111
-
8112
- inpL = cur;
8113
- }
8114
-
8115
- cur = inpL;
8116
-
8117
- cur = llm_build_norm(ctx0, cur, hparams,
8118
- model.output_norm,
8119
- model.output_norm_b,
8120
- LLM_NORM, cb, -1);
8121
- cb(cur, "result_norm", -1);
8122
-
8123
- cur = ggml_mul_mat(ctx0, model.output, cur);
8124
- cb(cur, "result_output", -1);
8125
-
8126
- ggml_build_forward_expand(gf, cur);
8127
-
8128
- return gf;
8129
- }
8130
-
8131
- struct ggml_cgraph * build_refact() {
8132
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8133
-
8134
- const int64_t n_embd_head = hparams.n_embd_head_v;
8135
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8136
-
8137
- struct ggml_tensor * cur;
8138
- struct ggml_tensor * inpL;
8139
-
8140
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8141
-
8142
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8143
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8144
-
8145
- for (int il = 0; il < n_layer; ++il) {
8146
- struct ggml_tensor * inpSA = inpL;
8147
-
8148
- cur = llm_build_norm(ctx0, inpL, hparams,
8149
- model.layers[il].attn_norm, NULL,
8150
- LLM_NORM_RMS, cb, il);
8151
- cb(cur, "attn_norm", il);
8152
-
8153
- // self-attention
8154
- {
8155
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8156
- cb(Qcur, "Qcur", il);
8157
-
8158
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8159
- cb(Kcur, "Kcur", il);
8160
-
8161
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8162
- cb(Vcur, "Vcur", il);
8163
-
8164
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8165
- cb(Kcur, "Kcur", il);
8166
-
8167
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8168
- cb(Qcur, "Qcur", il);
8169
-
8170
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8171
- model.layers[il].wo, NULL,
8172
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8173
- }
8124
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8125
+ model.layers[il].wo, NULL,
8126
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8127
+ }
8174
8128
 
8175
8129
  if (il == n_layer - 1) {
8176
8130
  // skip computing output for unused tokens
@@ -8304,15 +8258,15 @@ struct llm_build_context {
8304
8258
  cb(Kcur, "Kcur", il);
8305
8259
  cb(Vcur, "Vcur", il);
8306
8260
 
8307
- Qcur = ggml_rope_custom(
8308
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8261
+ Qcur = ggml_rope_ext(
8262
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8309
8263
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8310
8264
  ext_factor, attn_factor, beta_fast, beta_slow
8311
8265
  );
8312
8266
  cb(Qcur, "Qcur", il);
8313
8267
 
8314
- Kcur = ggml_rope_custom(
8315
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8268
+ Kcur = ggml_rope_ext(
8269
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8316
8270
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8317
8271
  ext_factor, attn_factor, beta_fast, beta_slow
8318
8272
  );
@@ -8744,15 +8698,15 @@ struct llm_build_context {
8744
8698
  }
8745
8699
 
8746
8700
 
8747
- Qcur = ggml_rope_custom(
8748
- ctx0, Qcur, inp_pos,
8701
+ Qcur = ggml_rope_ext(
8702
+ ctx0, Qcur, inp_pos, nullptr,
8749
8703
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8750
8704
  ext_factor, attn_factor, beta_fast, beta_slow
8751
8705
  );
8752
8706
  cb(Qcur, "Qcur", il);
8753
8707
 
8754
- Kcur = ggml_rope_custom(
8755
- ctx0, Kcur, inp_pos,
8708
+ Kcur = ggml_rope_ext(
8709
+ ctx0, Kcur, inp_pos, nullptr,
8756
8710
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8757
8711
  ext_factor, attn_factor, beta_fast, beta_slow
8758
8712
  );
@@ -8864,14 +8818,14 @@ struct llm_build_context {
8864
8818
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8865
8819
 
8866
8820
  // using mode = 2 for neox mode
8867
- Qcur = ggml_rope_custom(
8868
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8821
+ Qcur = ggml_rope_ext(
8822
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8869
8823
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8870
8824
  );
8871
8825
  cb(Qcur, "Qcur", il);
8872
8826
 
8873
- Kcur = ggml_rope_custom(
8874
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8827
+ Kcur = ggml_rope_ext(
8828
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8875
8829
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8876
8830
  );
8877
8831
  cb(Kcur, "Kcur", il);
@@ -8975,15 +8929,15 @@ struct llm_build_context {
8975
8929
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8976
8930
  cb(Vcur, "Vcur", il);
8977
8931
 
8978
- Qcur = ggml_rope_custom(
8979
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8932
+ Qcur = ggml_rope_ext(
8933
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8980
8934
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8981
8935
  ext_factor, attn_factor, beta_fast, beta_slow
8982
8936
  );
8983
8937
  cb(Qcur, "Qcur", il);
8984
8938
 
8985
- Kcur = ggml_rope_custom(
8986
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8939
+ Kcur = ggml_rope_ext(
8940
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8987
8941
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8988
8942
  ext_factor, attn_factor, beta_fast, beta_slow
8989
8943
  );
@@ -9089,15 +9043,15 @@ struct llm_build_context {
9089
9043
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9090
9044
  cb(Vcur, "Vcur", il);
9091
9045
 
9092
- Qcur = ggml_rope_custom(
9093
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9046
+ Qcur = ggml_rope_ext(
9047
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9094
9048
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9095
9049
  ext_factor, attn_factor, beta_fast, beta_slow
9096
9050
  );
9097
9051
  cb(Qcur, "Qcur", il);
9098
9052
 
9099
- Kcur = ggml_rope_custom(
9100
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9053
+ Kcur = ggml_rope_ext(
9054
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9101
9055
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9102
9056
  ext_factor, attn_factor, beta_fast, beta_slow
9103
9057
  );
@@ -9241,8 +9195,8 @@ struct llm_build_context {
9241
9195
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9242
9196
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9243
9197
 
9244
- Qcur = ggml_rope_custom(
9245
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9198
+ Qcur = ggml_rope_ext(
9199
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9246
9200
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9247
9201
  );
9248
9202
  cb(Qcur, "Qcur", il);
@@ -9252,8 +9206,8 @@ struct llm_build_context {
9252
9206
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9253
9207
  cb(Qcur, "Qcur", il);
9254
9208
 
9255
- Kcur = ggml_rope_custom(
9256
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9209
+ Kcur = ggml_rope_ext(
9210
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9257
9211
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9258
9212
  );
9259
9213
  cb(Kcur, "Kcur", il);
@@ -9329,6 +9283,9 @@ struct llm_build_context {
9329
9283
 
9330
9284
  // self-attention
9331
9285
  {
9286
+ // rope freq factors for 128k context
9287
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
9288
+
9332
9289
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9333
9290
  model.layers[il].attn_norm,
9334
9291
  NULL,
@@ -9360,8 +9317,8 @@ struct llm_build_context {
9360
9317
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9361
9318
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9362
9319
 
9363
- Qcur = ggml_rope_custom(
9364
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9320
+ Qcur = ggml_rope_ext(
9321
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9365
9322
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9366
9323
  );
9367
9324
  cb(Qcur, "Qcur", il);
@@ -9369,8 +9326,8 @@ struct llm_build_context {
9369
9326
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9370
9327
  cb(Qcur, "Qcur", il);
9371
9328
 
9372
- Kcur = ggml_rope_custom(
9373
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9329
+ Kcur = ggml_rope_ext(
9330
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9374
9331
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9375
9332
  );
9376
9333
  cb(Kcur, "Kcur", il);
@@ -9476,14 +9433,14 @@ struct llm_build_context {
9476
9433
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9477
9434
  cb(Vcur, "Vcur", il);
9478
9435
 
9479
- Qcur = ggml_rope_custom(
9480
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
9436
+ Qcur = ggml_rope_ext(
9437
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9481
9438
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9482
9439
  ext_factor, attn_factor, beta_fast, beta_slow);
9483
9440
  cb(Qcur, "Qcur", il);
9484
9441
 
9485
- Kcur = ggml_rope_custom(
9486
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
9442
+ Kcur = ggml_rope_ext(
9443
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9487
9444
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9488
9445
  ext_factor, attn_factor, beta_fast, beta_slow);
9489
9446
  cb(Kcur, "Kcur", il);
@@ -9684,15 +9641,15 @@ struct llm_build_context {
9684
9641
  cb(tmpk, "tmpk", il);
9685
9642
  cb(Vcur, "Vcur", il);
9686
9643
 
9687
- struct ggml_tensor * Qcur = ggml_rope_custom(
9688
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
9644
+ struct ggml_tensor * Qcur = ggml_rope_ext(
9645
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9689
9646
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9690
9647
  ext_factor, attn_factor, beta_fast, beta_slow
9691
9648
  );
9692
9649
  cb(Qcur, "Qcur", il);
9693
9650
 
9694
- struct ggml_tensor * Kcur = ggml_rope_custom(
9695
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
9651
+ struct ggml_tensor * Kcur = ggml_rope_ext(
9652
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9696
9653
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9697
9654
  ext_factor, attn_factor, beta_fast, beta_slow
9698
9655
  );
@@ -9800,15 +9757,15 @@ struct llm_build_context {
9800
9757
  // cb(Vcur, "Vcur", il);
9801
9758
  // }
9802
9759
 
9803
- Qcur = ggml_rope_custom(
9804
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9760
+ Qcur = ggml_rope_ext(
9761
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9805
9762
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9806
9763
  ext_factor, attn_factor, beta_fast, beta_slow
9807
9764
  );
9808
9765
  cb(Qcur, "Qcur", il);
9809
9766
 
9810
- Kcur = ggml_rope_custom(
9811
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9767
+ Kcur = ggml_rope_ext(
9768
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9812
9769
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9813
9770
  ext_factor, attn_factor, beta_fast, beta_slow
9814
9771
  );
@@ -9917,15 +9874,15 @@ struct llm_build_context {
9917
9874
  cb(Vcur, "Vcur", il);
9918
9875
  }
9919
9876
 
9920
- Qcur = ggml_rope_custom(
9921
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9877
+ Qcur = ggml_rope_ext(
9878
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9922
9879
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9923
9880
  ext_factor, attn_factor, beta_fast, beta_slow
9924
9881
  );
9925
9882
  cb(Qcur, "Qcur", il);
9926
9883
 
9927
- Kcur = ggml_rope_custom(
9928
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9884
+ Kcur = ggml_rope_ext(
9885
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9929
9886
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9930
9887
  ext_factor, attn_factor, beta_fast, beta_slow
9931
9888
  );
@@ -10047,15 +10004,15 @@ struct llm_build_context {
10047
10004
  cb(Vcur, "Vcur", il);
10048
10005
  }
10049
10006
 
10050
- Qcur = ggml_rope_custom(
10051
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10007
+ Qcur = ggml_rope_ext(
10008
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10052
10009
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10053
10010
  ext_factor, attn_factor, beta_fast, beta_slow
10054
10011
  );
10055
10012
  cb(Qcur, "Qcur", il);
10056
10013
 
10057
- Kcur = ggml_rope_custom(
10058
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10014
+ Kcur = ggml_rope_ext(
10015
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10059
10016
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10060
10017
  ext_factor, attn_factor, beta_fast, beta_slow
10061
10018
  );
@@ -10167,8 +10124,8 @@ struct llm_build_context {
10167
10124
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10168
10125
  cb(Vcur, "Vcur", il);
10169
10126
 
10170
- Qcur = ggml_rope_custom(
10171
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
10127
+ Qcur = ggml_rope_ext(
10128
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10172
10129
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10173
10130
  ext_factor, attn_factor, beta_fast, beta_slow);
10174
10131
  cb(Qcur, "Qcur", il);
@@ -10176,8 +10133,8 @@ struct llm_build_context {
10176
10133
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
10177
10134
  cb(Qcur, "Qcur_scaled", il);
10178
10135
 
10179
- Kcur = ggml_rope_custom(
10180
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
10136
+ Kcur = ggml_rope_ext(
10137
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10181
10138
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10182
10139
  ext_factor, attn_factor, beta_fast, beta_slow);
10183
10140
  cb(Kcur, "Kcur", il);
@@ -10287,15 +10244,15 @@ struct llm_build_context {
10287
10244
  cb(Vcur, "Vcur", il);
10288
10245
  }
10289
10246
 
10290
- Qcur = ggml_rope_custom(
10291
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10247
+ Qcur = ggml_rope_ext(
10248
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10292
10249
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10293
10250
  ext_factor, attn_factor, beta_fast, beta_slow
10294
10251
  );
10295
10252
  cb(Qcur, "Qcur", il);
10296
10253
 
10297
- Kcur = ggml_rope_custom(
10298
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10254
+ Kcur = ggml_rope_ext(
10255
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10299
10256
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10300
10257
  ext_factor, attn_factor, beta_fast, beta_slow
10301
10258
  );
@@ -10577,15 +10534,15 @@ struct llm_build_context {
10577
10534
  cb(Kcur, "Kcur", il);
10578
10535
  }
10579
10536
 
10580
- Qcur = ggml_rope_custom(
10581
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10537
+ Qcur = ggml_rope_ext(
10538
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10582
10539
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10583
10540
  ext_factor, attn_factor, beta_fast, beta_slow
10584
10541
  );
10585
10542
  cb(Qcur, "Qcur", il);
10586
10543
 
10587
- Kcur = ggml_rope_custom(
10588
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10544
+ Kcur = ggml_rope_ext(
10545
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10589
10546
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10590
10547
  ext_factor, attn_factor, beta_fast, beta_slow
10591
10548
  );
@@ -10708,15 +10665,15 @@ struct llm_build_context {
10708
10665
  cb(Vcur, "Vcur", il);
10709
10666
  }
10710
10667
 
10711
- Qcur = ggml_rope_custom(
10712
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10668
+ Qcur = ggml_rope_ext(
10669
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10713
10670
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10714
10671
  ext_factor, attn_factor, beta_fast, beta_slow
10715
10672
  );
10716
10673
  cb(Qcur, "Qcur", il);
10717
10674
 
10718
- Kcur = ggml_rope_custom(
10719
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10675
+ Kcur = ggml_rope_ext(
10676
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10720
10677
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10721
10678
  ext_factor, attn_factor, beta_fast, beta_slow
10722
10679
  );
@@ -10780,6 +10737,274 @@ struct llm_build_context {
10780
10737
 
10781
10738
  return gf;
10782
10739
  }
10740
+
10741
+ struct ggml_cgraph * build_gptneox() {
10742
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10743
+
10744
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10745
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
10746
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10747
+
10748
+ struct ggml_tensor * cur;
10749
+ struct ggml_tensor * inpL;
10750
+
10751
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10752
+
10753
+ // inp_pos - contains the positions
10754
+ struct ggml_tensor * inp_pos = build_inp_pos();
10755
+
10756
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10757
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10758
+
10759
+ for (int il = 0; il < n_layer; ++il) {
10760
+ cur = llm_build_norm(ctx0, inpL, hparams,
10761
+ model.layers[il].attn_norm,
10762
+ model.layers[il].attn_norm_b,
10763
+ LLM_NORM, cb, il);
10764
+ cb(cur, "attn_norm", il);
10765
+
10766
+ // self-attention
10767
+ {
10768
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10769
+ cb(cur, "wqkv", il);
10770
+
10771
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10772
+ cb(cur, "bqkv", il);
10773
+
10774
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10775
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
10776
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10777
+
10778
+ cb(Qcur, "Qcur", il);
10779
+ cb(Kcur, "Kcur", il);
10780
+ cb(Vcur, "Vcur", il);
10781
+
10782
+ Qcur = ggml_rope_ext(
10783
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10784
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10785
+ ext_factor, attn_factor, beta_fast, beta_slow
10786
+ );
10787
+ cb(Qcur, "Qcur", il);
10788
+
10789
+ Kcur = ggml_rope_ext(
10790
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10791
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10792
+ ext_factor, attn_factor, beta_fast, beta_slow
10793
+ );
10794
+ cb(Kcur, "Kcur", il);
10795
+
10796
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10797
+ model.layers[il].wo, model.layers[il].bo,
10798
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10799
+ }
10800
+
10801
+ if (il == n_layer - 1) {
10802
+ // skip computing output for unused tokens
10803
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10804
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10805
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10806
+ }
10807
+
10808
+ // ffn
10809
+ if (hparams.use_par_res) {
10810
+ // attention and ffn are computed in parallel
10811
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
10812
+
10813
+ struct ggml_tensor * attn_out = cur;
10814
+
10815
+ cur = llm_build_norm(ctx0, inpL, hparams,
10816
+ model.layers[il].ffn_norm,
10817
+ model.layers[il].ffn_norm_b,
10818
+ LLM_NORM, cb, il);
10819
+ cb(cur, "ffn_norm", il);
10820
+
10821
+ cur = llm_build_ffn(ctx0, cur,
10822
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10823
+ NULL, NULL,
10824
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10825
+ NULL,
10826
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10827
+ cb(cur, "ffn_out", il);
10828
+
10829
+ cur = ggml_add(ctx0, cur, inpL);
10830
+ cb(cur, "ffn_out", il);
10831
+
10832
+ inpL = ggml_add(ctx0, cur, attn_out);
10833
+ cb(inpL, "l_out", il);
10834
+ } else {
10835
+ // attention and ffn are computed sequentially
10836
+ // x = x + attn(ln1(x))
10837
+ // x = x + ffn(ln2(x))
10838
+
10839
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
10840
+ cb(ffn_inp, "ffn_inp", il);
10841
+
10842
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10843
+ model.layers[il].ffn_norm,
10844
+ model.layers[il].ffn_norm_b,
10845
+ LLM_NORM, cb, il);
10846
+ cb(cur, "ffn_norm", il);
10847
+
10848
+ cur = llm_build_ffn(ctx0, cur,
10849
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10850
+ NULL, NULL,
10851
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10852
+ NULL,
10853
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10854
+ cb(cur, "ffn_out", il);
10855
+
10856
+ inpL = ggml_add(ctx0, cur, ffn_inp);
10857
+ cb(inpL, "l_out", il);
10858
+ }
10859
+ }
10860
+
10861
+ cur = llm_build_norm(ctx0, inpL, hparams,
10862
+ model.output_norm,
10863
+ model.output_norm_b,
10864
+ LLM_NORM, cb, -1);
10865
+ cb(cur, "result_norm", -1);
10866
+
10867
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10868
+ cb(cur, "result_output", -1);
10869
+
10870
+ ggml_build_forward_expand(gf, cur);
10871
+
10872
+ return gf;
10873
+ }
10874
+
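The new build_gptneox graph above switches between the two residual layouts spelled out in its comments via hparams.use_par_res. A tiny single-float sketch of the difference, with placeholder functions standing in for attention, the FFN and LayerNorm (the scaling constants are arbitrary, chosen only to make the two results differ):

#include <cstdio>

// Placeholder single-float stand-ins for the transformer blocks; not ggml calls.
static float attn(float x) { return 0.5f  * x; }
static float ffn (float x) { return 0.25f * x; }
static float ln  (float x) { return x; } // identity stand-in for LayerNorm

int main() {
    const float x = 1.0f;

    // parallel residual: x = x + attn(ln1(x)) + ffn(ln2(x))
    const float par = x + attn(ln(x)) + ffn(ln(x));

    // sequential residual: x = x + attn(ln1(x)); x = x + ffn(ln2(x))
    float seq = x + attn(ln(x));
    seq = seq + ffn(ln(seq));

    printf("parallel:   %.4f\n", par); // 1.75
    printf("sequential: %.4f\n", seq); // 1.875
    return 0;
}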
10875
+ struct ggml_cgraph * build_arctic() {
10876
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10877
+
10878
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10879
+ int32_t n_tokens = this->n_tokens;
10880
+
10881
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10882
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10883
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10884
+
10885
+ struct ggml_tensor * cur;
10886
+ struct ggml_tensor * inpL;
10887
+
10888
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10889
+
10890
+ // inp_pos - contains the positions
10891
+ struct ggml_tensor * inp_pos = build_inp_pos();
10892
+
10893
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10894
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10895
+
10896
+ for (int il = 0; il < n_layer; ++il) {
10897
+ struct ggml_tensor * inpSA = inpL;
10898
+
10899
+ // norm
10900
+ cur = llm_build_norm(ctx0, inpL, hparams,
10901
+ model.layers[il].attn_norm, NULL,
10902
+ LLM_NORM_RMS, cb, il);
10903
+ cb(cur, "attn_norm", il);
10904
+
10905
+ // self-attention
10906
+ {
10907
+ // compute Q and K and RoPE them
10908
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10909
+ cb(Qcur, "Qcur", il);
10910
+
10911
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10912
+ cb(Kcur, "Kcur", il);
10913
+
10914
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10915
+ cb(Vcur, "Vcur", il);
10916
+
10917
+ Qcur = ggml_rope_ext(
10918
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10919
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10920
+ ext_factor, attn_factor, beta_fast, beta_slow
10921
+ );
10922
+ cb(Qcur, "Qcur", il);
10923
+
10924
+ Kcur = ggml_rope_ext(
10925
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10926
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10927
+ ext_factor, attn_factor, beta_fast, beta_slow
10928
+ );
10929
+ cb(Kcur, "Kcur", il);
10930
+
10931
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10932
+ model.layers[il].wo, NULL,
10933
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10934
+ }
10935
+
10936
+ if (il == n_layer - 1) {
10937
+ // skip computing output for unused tokens
10938
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10939
+ n_tokens = n_outputs;
10940
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10941
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10942
+ }
10943
+
10944
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10945
+ cb(ffn_inp, "ffn_inp", il);
10946
+
10947
+ // feed-forward network
10948
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10949
+ model.layers[il].ffn_norm, NULL,
10950
+ LLM_NORM_RMS, cb, il);
10951
+ cb(cur, "ffn_norm", il);
10952
+
10953
+ cur = llm_build_ffn(ctx0, cur,
10954
+ model.layers[il].ffn_up, NULL,
10955
+ model.layers[il].ffn_gate, NULL,
10956
+ model.layers[il].ffn_down, NULL,
10957
+ NULL,
10958
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10959
+ cb(cur, "ffn_out", il);
10960
+
10961
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
10962
+ cb(ffn_out, "ffn_out", il);
10963
+
10964
+ // MoE
10965
+ cur = llm_build_norm(ctx0, inpSA, hparams,
10966
+ model.layers[il].ffn_norm_exps, NULL,
10967
+ LLM_NORM_RMS, cb, il);
10968
+ cb(cur, "ffn_norm_exps", il);
10969
+
10970
+ cur = llm_build_moe_ffn(ctx0, cur,
10971
+ model.layers[il].ffn_gate_inp,
10972
+ model.layers[il].ffn_up_exps,
10973
+ model.layers[il].ffn_gate_exps,
10974
+ model.layers[il].ffn_down_exps,
10975
+ n_expert, n_expert_used,
10976
+ LLM_FFN_SILU, true,
10977
+ cb, il);
10978
+ cb(cur, "ffn_moe_out", il);
10979
+
10980
+ cur = ggml_add(ctx0, cur, ffn_out);
10981
+ cb(cur, "ffn_out", il);
10982
+
10983
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10984
+ if (layer_dir != nullptr) {
10985
+ cur = ggml_add(ctx0, cur, layer_dir);
10986
+ }
10987
+ cb(cur, "l_out", il);
10988
+
10989
+ // input for next layer
10990
+ inpL = cur;
10991
+ }
10992
+
10993
+ cur = inpL;
10994
+
10995
+ cur = llm_build_norm(ctx0, cur, hparams,
10996
+ model.output_norm, NULL,
10997
+ LLM_NORM_RMS, cb, -1);
10998
+ cb(cur, "result_norm", -1);
10999
+
11000
+ // lm_head
11001
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11002
+ cb(cur, "result_output", -1);
11003
+
11004
+ ggml_build_forward_expand(gf, cur);
11005
+
11006
+ return gf;
11007
+ }
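In the build_arctic graph above, each layer combines three residual streams: the attention output plus the layer input, a dense FFN applied to that sum, and a MoE branch computed from the original layer input (inpSA) through its own ffn_norm_exps. A tiny single-float sketch of that composition, with placeholder functions standing in for the ggml blocks (the scaling constants are arbitrary):

#include <cstdio>

// Single-float stand-ins for the building blocks; not ggml calls.
static float rms_norm (float x) { return x; }          // identity stand-in
static float attn     (float x) { return 0.5f   * x; }
static float dense_ffn(float x) { return 0.25f  * x; }
static float moe_ffn  (float x) { return 0.125f * x; }

int main() {
    const float x = 1.0f; // layer input (inpSA)

    const float ffn_inp = x + attn(rms_norm(x));                   // attention residual
    const float ffn_out = ffn_inp + dense_ffn(rms_norm(ffn_inp));  // dense FFN path
    const float moe_out = moe_ffn(rms_norm(x));                    // MoE path reads the layer input
    const float l_out   = ffn_out + moe_out;                       // combined layer output

    printf("l_out = %.4f\n", l_out); // 1.5 + 0.375 + 0.125 = 2.0
    return 0;
}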
10783
11008
  };
10784
11009
 
10785
11010
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -10896,10 +11121,6 @@ static struct ggml_cgraph * llama_build_graph(
10896
11121
  {
10897
11122
  result = llm.build_starcoder();
10898
11123
  } break;
10899
- case LLM_ARCH_PERSIMMON:
10900
- {
10901
- result = llm.build_persimmon();
10902
- } break;
10903
11124
  case LLM_ARCH_REFACT:
10904
11125
  {
10905
11126
  result = llm.build_refact();
@@ -10994,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
10994
11215
  {
10995
11216
  result = llm.build_olmo();
10996
11217
  } break;
11218
+ case LLM_ARCH_GPTNEOX:
11219
+ {
11220
+ result = llm.build_gptneox();
11221
+ } break;
11222
+ case LLM_ARCH_ARCTIC:
11223
+ {
11224
+ result = llm.build_arctic();
11225
+ } break;
10997
11226
  default:
10998
11227
  GGML_ASSERT(false);
10999
11228
  }
@@ -11339,11 +11568,6 @@ static void llama_graph_compute(
11339
11568
  llama_context & lctx,
11340
11569
  ggml_cgraph * gf,
11341
11570
  int n_threads) {
11342
- #ifdef GGML_USE_MPI
11343
- const int64_t n_layer = lctx.model.hparams.n_layer;
11344
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
11345
- #endif
11346
-
11347
11571
  #ifdef GGML_USE_METAL
11348
11572
  if (ggml_backend_is_metal(lctx.backend_metal)) {
11349
11573
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11358,10 +11582,6 @@ static void llama_graph_compute(
11358
11582
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
11359
11583
 
11360
11584
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
11361
-
11362
- #ifdef GGML_USE_MPI
11363
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
11364
- #endif
11365
11585
  }
11366
11586
 
11367
11587
  // decode a batch of tokens by evaluating the transformer
@@ -11399,12 +11619,6 @@ static int llama_decode_internal(
11399
11619
  }
11400
11620
  lctx.n_queued_tokens += n_tokens_all;
11401
11621
 
11402
- #ifdef GGML_USE_MPI
11403
- // TODO: needs fix after #3228
11404
- GGML_ASSERT(false && "not implemented");
11405
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
11406
- #endif
11407
-
11408
11622
  auto & kv_self = lctx.kv_self;
11409
11623
 
11410
11624
  const int64_t n_embd = hparams.n_embd;
@@ -12354,6 +12568,7 @@ struct llm_tokenizer_bpe {
12354
12568
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12355
12569
  });
12356
12570
  break;
12571
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
12357
12572
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12358
12573
  word_collection = unicode_regex_split(text, {
12359
12574
  // original regex from tokenizer.json
@@ -12788,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12788
13003
  // tokenizer.encode('', add_special_tokens=True) returns [1]
12789
13004
  // tokenizer.encode('', add_special_tokens=False) returns []
12790
13005
 
13006
+ static const bool rtrim = true; //TODO: as param
13007
+ bool is_prev_special = false;
13008
+ bool special_token_rtrim = false;
13009
+
12791
13010
  if (add_special && vocab.special_add_bos != 0) {
12792
13011
  GGML_ASSERT(vocab.special_bos_id != -1);
12793
13012
  output.push_back(vocab.special_bos_id);
13013
+ is_prev_special = true;
12794
13014
  }
12795
13015
 
12796
13016
  for (const auto & fragment : fragment_buffer) {
@@ -12802,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12802
13022
  // and passing 'add space prefix' as bool argument
12803
13023
  //
12804
13024
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12805
- if (&fragment == &fragment_buffer.front()) {
12806
- if (vocab.add_space_prefix) {
12807
- raw_text = " " + raw_text; // prefix with space if the first token is not special
13025
+
13026
+ if (special_token_rtrim) {
13027
+ size_t num_whitespaces = 0;
13028
+ while (isspace(raw_text[num_whitespaces])) {
13029
+ num_whitespaces++;
13030
+ }
13031
+ if (num_whitespaces == raw_text.size()) {
13032
+ continue; // skip if all whitespaces
13033
+ }
13034
+ raw_text = raw_text.substr(num_whitespaces);
13035
+ }
13036
+
13037
+ if (vocab.add_space_prefix) {
13038
+ if (!output.size() || is_prev_special) { // prefix with space if first token
13039
+ raw_text = " " + raw_text;
12808
13040
  }
12809
13041
  }
12810
13042
 
@@ -12816,6 +13048,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12816
13048
  tokenizer.tokenize(raw_text, output);
12817
13049
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
12818
13050
  output.push_back(fragment.token);
13051
+ is_prev_special = true;
13052
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
13053
+ special_token_rtrim = rtrim
13054
+ && fragment.token != vocab.special_bos_id
13055
+ && fragment.token != vocab.special_unk_id
13056
+ && fragment.token != vocab.special_eos_id;
12819
13057
  }
12820
13058
  }
12821
13059
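The rtrim handling added above strips leading whitespace from the text fragment that follows a special token (other than BOS, UNK and EOS), and skips the fragment entirely if it is whitespace only. A standalone sketch of that trimming step, independent of the tokenizer (the helper name is made up, not part of the library):

#include <cctype>
#include <cstdio>
#include <string>

// Hypothetical helper mirroring the rtrim-after-special-token step:
// returns false when the fragment is whitespace-only and should be skipped.
static bool rtrim_after_special(std::string & raw_text) {
    size_t n = 0;
    while (n < raw_text.size() && isspace((unsigned char) raw_text[n])) {
        n++;
    }
    if (n == raw_text.size()) {
        return false; // all whitespace: drop the fragment
    }
    raw_text = raw_text.substr(n);
    return true;
}

int main() {
    std::string a = "  Hello";
    std::string b = "   ";
    printf("keep=%d text='%s'\n", rtrim_after_special(a), a.c_str()); // keep=1 text='Hello'
    printf("keep=%d\n",           rtrim_after_special(b));            // keep=0
    return 0;
}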
 
@@ -14518,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14518
14756
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
14519
14757
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
14520
14758
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
14521
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
14522
- (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
14523
14759
  if (qs.model.type == MODEL_70B) {
14524
14760
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
14525
14761
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15533,10 +15769,6 @@ void llama_backend_init(void) {
15533
15769
  struct ggml_context * ctx = ggml_init(params);
15534
15770
  ggml_free(ctx);
15535
15771
  }
15536
-
15537
- #ifdef GGML_USE_MPI
15538
- ggml_mpi_backend_init();
15539
- #endif
15540
15772
  }
15541
15773
 
15542
15774
  void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15546,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
15546
15778
  }
15547
15779
 
15548
15780
  void llama_backend_free(void) {
15549
- #ifdef GGML_USE_MPI
15550
- ggml_mpi_backend_free();
15551
- #endif
15552
15781
  ggml_quantize_free();
15553
15782
  }
15554
15783
 
@@ -15691,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
15691
15920
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
15692
15921
  }
15693
15922
 
15923
+ cparams.yarn_attn_factor *= hparams.rope_attn_factor;
15694
15924
  cparams.causal_attn = hparams.causal_attn;
15695
15925
 
15696
15926
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15949,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
15949
16179
  }
15950
16180
  }
15951
16181
 
15952
- #ifdef GGML_USE_MPI
15953
- ctx->ctx_mpi = ggml_mpi_init();
15954
-
15955
- if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
15956
- // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
15957
- // TODO: needs fix after #3228
15958
- GGML_ASSERT(false && "not implemented");
15959
- //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
15960
- //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
15961
- llama_backend_free();
15962
- exit(1);
15963
- }
15964
- #endif
15965
-
15966
16182
  return ctx;
15967
16183
  }
15968
16184
 
@@ -15999,7 +16215,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15999
16215
  // these models do not use RoPE
16000
16216
  case LLM_ARCH_GPT2:
16001
16217
  case LLM_ARCH_GPTJ:
16002
- case LLM_ARCH_GPTNEOX:
16003
16218
  case LLM_ARCH_MPT:
16004
16219
  case LLM_ARCH_REFACT:
16005
16220
  case LLM_ARCH_BLOOM:
@@ -16019,13 +16234,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16019
16234
  case LLM_ARCH_XVERSE:
16020
16235
  case LLM_ARCH_COMMAND_R:
16021
16236
  case LLM_ARCH_OLMO:
16237
+ case LLM_ARCH_ARCTIC:
16022
16238
  return LLAMA_ROPE_TYPE_NORM;
16023
16239
 
16024
16240
  // the pairs of head values are offset by n_rot/2
16025
16241
  case LLM_ARCH_FALCON:
16026
16242
  case LLM_ARCH_GROK:
16027
16243
  case LLM_ARCH_DBRX:
16028
- case LLM_ARCH_PERSIMMON:
16029
16244
  case LLM_ARCH_BERT:
16030
16245
  case LLM_ARCH_NOMIC_BERT:
16031
16246
  case LLM_ARCH_STABLELM:
@@ -16036,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16036
16251
  case LLM_ARCH_PHI3:
16037
16252
  case LLM_ARCH_GEMMA:
16038
16253
  case LLM_ARCH_STARCODER2:
16254
+ case LLM_ARCH_GPTNEOX:
16039
16255
  return LLAMA_ROPE_TYPE_NEOX;
16040
16256
 
16041
16257
  // all model arches should be listed explicitly here
@@ -16195,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
16195
16411
  }
16196
16412
 
16197
16413
  // make tensors
16414
+ cvec.tensors.reserve(model.hparams.n_layer);
16198
16415
  cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
16199
16416
  for (size_t il = 1; il < model.hparams.n_layer; il++) {
16200
16417
  struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16203,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
16203
16420
  }
16204
16421
 
16205
16422
  // allocate tensors / buffers and zero
16423
+ cvec.ctxs.reserve(ctx_map.size());
16424
+ cvec.bufs.reserve(ctx_map.size());
16206
16425
  for (auto it : ctx_map) {
16207
16426
  ggml_backend_buffer_type_t buft = it.first;
16208
16427
  ggml_context * ctx = it.second;
@@ -17411,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
17411
17630
  ctx->cparams.n_threads_batch = n_threads_batch;
17412
17631
  }
17413
17632
 
17633
+ uint32_t llama_n_threads(struct llama_context * ctx) {
17634
+ return ctx->cparams.n_threads;
17635
+ }
17636
+
17637
+ uint32_t llama_n_threads_batch(struct llama_context * ctx) {
17638
+ return ctx->cparams.n_threads_batch;
17639
+ }
17640
+
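The two getters above read back the thread counts that llama_set_n_threads stores in cparams. A minimal usage sketch against the public API; creating and freeing the context is assumed to happen elsewhere (for example via llama_new_context_with_model):

#include <cstdio>
#include "llama.h"

// Assumes `ctx` was obtained elsewhere and is valid.
static void configure_threads(struct llama_context * ctx) {
    // generation threads = 8, batch/prompt-processing threads = 16
    llama_set_n_threads(ctx, 8, 16);

    // read the values back through the new getters
    printf("n_threads       = %u\n", llama_n_threads(ctx));
    printf("n_threads_batch = %u\n", llama_n_threads_batch(ctx));
}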
17414
17641
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
17415
17642
  ctx->abort_callback = abort_callback;
17416
17643
  ctx->abort_callback_data = abort_callback_data;
@@ -17845,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
17845
18072
  }
17846
18073
  }
17847
18074
  // llama2 templates seem to not care about "add_generation_prompt"
18075
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
18076
+ // Phi 3
18077
+ for (auto message : chat) {
18078
+ std::string role(message->role);
18079
+ ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
18080
+ }
18081
+ if (add_ass) {
18082
+ ss << "<|assistant|>\n";
18083
+ }
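For the Phi 3 branch added above, a short standalone sketch of the prompt string it produces for a hypothetical two-message chat with add_ass enabled (roles and contents are made up). Note the relocated branch emits message->content verbatim, without the trim() call used in the earlier location of this template.

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

struct msg { std::string role, content; };

int main() {
    // Hypothetical conversation.
    const std::vector<msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    const bool add_ass = true;

    // Same string construction as the Phi 3 branch above.
    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|" << m.role << "|>\n" << m.content << "<|end|>\n";
    }
    if (add_ass) {
        ss << "<|assistant|>\n";
    }

    // Expected output:
    // <|system|>
    // You are a helpful assistant.<|end|>
    // <|user|>
    // Hello!<|end|>
    // <|assistant|>
    printf("%s", ss.str().c_str());
    return 0;
}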
17848
18084
  } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
17849
18085
  // zephyr template
17850
18086
  for (auto message : chat) {
@@ -17977,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
17977
18213
  if (add_ass) {
17978
18214
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
17979
18215
  }
17980
- } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
17981
- // Phi 3
17982
- for (auto message : chat) {
17983
- std::string role(message->role);
17984
- ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
17985
- }
17986
- if (add_ass) {
17987
- ss << "<|assistant|>\n";
17988
- }
17989
18216
  } else {
17990
18217
  // template not supported
17991
18218
  return -1;
@@ -18107,6 +18334,7 @@ const char * llama_print_system_info(void) {
18107
18334
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
18108
18335
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
18109
18336
  s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
18337
+ s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
18110
18338
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
18111
18339
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
18112
18340
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
@@ -18167,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
18167
18395
  g_state.log_callback_user_data = user_data;
18168
18396
  #ifdef GGML_USE_METAL
18169
18397
  ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
18398
+ #elif defined(GGML_USE_CUDA)
18399
+ ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
18170
18400
  #endif
18171
18401
  }
18172
18402