llama_cpp 0.15.2 → 0.15.3

@@ -26,16 +26,9 @@
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
- #ifdef GGML_USE_MPI
- # include "ggml-mpi.h"
- #endif
- #ifndef QK_K
- # ifdef GGML_QKK_64
- # define QK_K 64
- # else
- # define QK_K 256
- # endif
- #endif
+
+ // TODO: replace with ggml API call
+ #define QK_K 256

  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 60
+ #define LLAMA_MAX_EXPERTS 128

  //
  // logging
@@ -205,7 +198,6 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,7 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_ARCTIC,
  LLM_ARCH_UNKNOWN,
  };

@@ -242,7 +235,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MPT, "mpt" },
  { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -266,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_ARCTIC, "arctic" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -309,6 +302,7 @@ enum llm_kv {
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
  LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,

@@ -386,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

@@ -441,6 +436,8 @@ enum llm_tensor {
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
  LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
  LLM_TENSOR_ATTN_Q,
  LLM_TENSOR_ATTN_K,
  LLM_TENSOR_ATTN_V,
@@ -460,6 +457,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_NORM_EXPS,
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
@@ -598,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
- {
- LLM_ARCH_PERSIMMON,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
- { LLM_TENSOR_OUTPUT, "output"},
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
- },
- },
  {
  LLM_ARCH_MPT,
  {
@@ -825,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  {
  LLM_ARCH_PHI3,
  {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -1052,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_ARCTIC,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1697,6 +1702,8 @@ struct llama_state {
  llama_state() {
  #ifdef GGML_USE_METAL
  ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+ ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
  #endif
  }

@@ -1710,17 +1717,24 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_14M,
  MODEL_17M,
  MODEL_22M,
  MODEL_33M,
+ MODEL_70M,
  MODEL_109M,
  MODEL_137M,
+ MODEL_160M,
  MODEL_335M,
+ MODEL_410M,
  MODEL_0_5B,
  MODEL_1B,
+ MODEL_1_4B,
  MODEL_2B,
+ MODEL_2_8B,
  MODEL_3B,
  MODEL_4B,
+ MODEL_6_9B,
  MODEL_7B,
  MODEL_8B,
  MODEL_12B,
@@ -1743,6 +1757,7 @@ enum e_model {
  MODEL_8x7B,
  MODEL_8x22B,
  MODEL_16x12B,
+ MODEL_10B_128x3_66B,
  };

  static const size_t kiB = 1024;
@@ -1752,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
  struct llama_hparams {
  bool vocab_only;
  bool rope_finetuned;
+ bool use_par_res;

  uint32_t n_vocab;
  uint32_t n_ctx_train; // context size the model was trained on
@@ -1770,6 +1786,7 @@ struct llama_hparams {
  float f_norm_eps;
  float f_norm_rms_eps;

+ float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
@@ -1818,6 +1835,7 @@ struct llama_hparams {

  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+ if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

@@ -1915,6 +1933,7 @@ struct llama_layer {
  struct ggml_tensor * ffn_norm_b;
  struct ggml_tensor * layer_out_norm;
  struct ggml_tensor * layer_out_norm_b;
+ struct ggml_tensor * ffn_norm_exps;

  // ff
  struct ggml_tensor * ffn_gate; // w1
@@ -1952,6 +1971,10 @@ struct llama_layer {
  // mamba bias
  struct ggml_tensor * ssm_conv1d_b;
  struct ggml_tensor * ssm_dt_b;
+
+ // long rope factors
+ struct ggml_tensor * rope_long = nullptr;
+ struct ggml_tensor * rope_short = nullptr;
  };

  struct llama_kv_cell {
@@ -2268,10 +2291,6 @@ struct llama_context {

  // control vectors
  struct llama_control_vector cvec;
-
- #ifdef GGML_USE_MPI
- ggml_mpi_context * ctx_mpi = NULL;
- #endif
  };

  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2510,6 @@ static bool llama_kv_cache_init(
  static bool llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_batch & batch) {
- const uint32_t n_ctx = cache.size;
  const uint32_t n_tokens = batch.n_tokens;

  if (cache.recurrent) {
@@ -2542,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
  }
  // otherwise, one cell per token.

- if (n_tokens > n_ctx) {
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ if (n_tokens > cache.size) {
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
  return false;
  }

  uint32_t n_tested = 0;

  while (true) {
- if (cache.head + n_tokens > n_ctx) {
- n_tested += n_ctx - cache.head;
+ if (cache.head + n_tokens > cache.size) {
+ n_tested += cache.size - cache.head;
  cache.head = 0;
  continue;
  }
@@ -2570,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
  break;
  }

- if (n_tested >= n_ctx) {
+ if (n_tested >= cache.size) {
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
  return false;
  }
@@ -3330,6 +3348,39 @@ struct llama_model_loader {
  return get_arr_n(llm_kv(kid), result, required);
  }

+ template<typename T>
+ bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+ const int kid = gguf_find_key(meta, key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+ if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+ throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+ }
+
+ // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+ result.resize(arr_info.length);
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+ return true;
+ }
+
+ template<typename T>
+ bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+ return get_arr(llm_kv(kid), result, required);
+ }
+
  template<typename T>
  bool get_key(const std::string & key, T & result, const bool required = true) {
  auto it = kv_overrides.find(key);
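Note: the new llama_model_loader::get_arr helper above copies a GGUF float32 or int32 array key into a std::vector. A minimal usage sketch, assuming a loader instance ml; the key string below is illustrative only, not a key defined by this release:

    // sketch: read an optional float array from GGUF metadata into a vector
    std::vector<float> factors;
    if (ml.get_arr("example.rope.scaling.factors", factors, /*required =*/ false)) {
        // factors now holds arr_info.length values copied from the model file
    }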
@@ -3404,11 +3455,15 @@ struct llama_model_loader {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
  ggml_set_name(tensor, ggml_get_name(cur));

- n_created++;
+ if (duplicated) {
+ size_data += ggml_nbytes(cur);
+ } else {
+ n_created++;
+ }

  return tensor;
  }
@@ -3443,14 +3498,17 @@ struct llama_model_loader {
  return cur;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+ static const int TENSOR_NOT_REQUIRED = 1;
+ static const int TENSOR_DUPLICATED = 2;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

  if (cur == NULL) {
  return NULL;
  }

- return create_tensor_for(ctx, cur);
+ return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
  }

  struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
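Note: create_tensor's boolean required parameter is replaced by a small flag set. TENSOR_NOT_REQUIRED makes a missing tensor return NULL instead of throwing, and TENSOR_DUPLICATED marks a tensor that aliases data already accounted for (its bytes are added to size_data but it is not counted in n_created). A sketch of the pattern, assembled from the loader hunks later in this diff:

    // optional output matrix; fall back to duplicating the token embedding
    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"),
                                    {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
    if (model.output == NULL) {
        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),
                                        {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
    }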
@@ -3750,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

  static const char * llama_model_type_name(e_model type) {
  switch (type) {
- case MODEL_22M: return "22M";
- case MODEL_33M: return "33M";
- case MODEL_109M: return "109M";
- case MODEL_137M: return "137M";
- case MODEL_0_5B: return "0.5B";
- case MODEL_1B: return "1B";
- case MODEL_2B: return "2B";
- case MODEL_3B: return "3B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_12B: return "12B";
- case MODEL_13B: return "13B";
- case MODEL_14B: return "14B";
- case MODEL_15B: return "15B";
- case MODEL_20B: return "20B";
- case MODEL_30B: return "30B";
- case MODEL_34B: return "34B";
- case MODEL_35B: return "35B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- case MODEL_314B: return "314B";
- case MODEL_SMALL: return "0.1B";
- case MODEL_MEDIUM: return "0.4B";
- case MODEL_LARGE: return "0.8B";
- case MODEL_XL: return "1.5B";
- case MODEL_A2_7B: return "A2.7B";
- case MODEL_8x7B: return "8x7B";
- case MODEL_8x22B: return "8x22B";
- case MODEL_16x12B: return "16x12B";
- default: return "?B";
+ case MODEL_14M: return "14M";
+ case MODEL_17M: return "17M";
+ case MODEL_22M: return "22M";
+ case MODEL_33M: return "33M";
+ case MODEL_70M: return "70M";
+ case MODEL_109M: return "109M";
+ case MODEL_137M: return "137M";
+ case MODEL_160M: return "160M";
+ case MODEL_335M: return "335M";
+ case MODEL_410M: return "410M";
+ case MODEL_0_5B: return "0.5B";
+ case MODEL_1B: return "1B";
+ case MODEL_1_4B: return "1.4B";
+ case MODEL_2B: return "2B";
+ case MODEL_2_8B: return "2.8B";
+ case MODEL_3B: return "3B";
+ case MODEL_4B: return "4B";
+ case MODEL_6_9B: return "6.9B";
+ case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
+ case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
+ case MODEL_15B: return "15B";
+ case MODEL_20B: return "20B";
+ case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
+ case MODEL_35B: return "35B";
+ case MODEL_40B: return "40B";
+ case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
+ case MODEL_314B: return "314B";
+ case MODEL_SMALL: return "0.1B";
+ case MODEL_MEDIUM: return "0.4B";
+ case MODEL_LARGE: return "0.8B";
+ case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
+ case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+ default: return "?B";
  }
  }

@@ -3873,6 +3942,8 @@ static void llm_load_hparams(
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
  // sanity check for n_rot (optional)
  {
  hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3972,14 +4043,6 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 36: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
  case LLM_ARCH_REFACT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4184,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_14B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4261,6 +4325,65 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ switch (hparams.n_layer) {
+ case 6:
+ switch (hparams.n_ff) {
+ case 512: model.type = e_model::MODEL_14M; break;
+ case 2048: model.type = e_model::MODEL_70M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 12:
+ switch (hparams.n_ff) {
+ case 3072: model.type = e_model::MODEL_160M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 16:
+ switch (hparams.n_ff) {
+ case 8192: model.type = e_model::MODEL_1B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff) {
+ case 4096: model.type = e_model::MODEL_410M; break;
+ case 8192: model.type = e_model::MODEL_1_4B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 32:
+ switch (hparams.n_ff) {
+ case 10240: model.type = e_model::MODEL_2_8B; break;
+ case 16384: model.type = e_model::MODEL_6_9B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 36:
+ switch (hparams.n_ff) {
+ case 20480: model.type = e_model::MODEL_12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 44:
+ switch (hparams.n_ff) {
+ case 24576: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ if (hparams.n_expert == 128) {
+ switch (hparams.n_layer) {
+ case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4461,6 +4584,9 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "qwen2") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "stablelm2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
  } else if (
  tokenizer_pre == "olmo") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -4582,7 +4708,8 @@ static void llm_load_vocab(
  (t.first == "<|eot_id|>" ||
  t.first == "<|im_end|>" ||
  t.first == "<|end|>" ||
- t.first == "<end_of_turn>"
+ t.first == "<end_of_turn>" ||
+ t.first == "<|endoftext|>"
  )
  ) {
  vocab.special_eot_id = t.second;
@@ -4908,6 +5035,7 @@ static bool llm_load_tensors(
  // create tensors for the weights
  {
  const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_head = n_embd / hparams.n_head;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4942,12 +5070,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  if (model.arch != LLM_ARCH_MINICPM){
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }
  }
@@ -4966,10 +5092,10 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

@@ -4980,7 +5106,7 @@ static bool llm_load_tensors(
  } else {
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5022,12 +5148,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5050,7 +5174,7 @@ static bool llm_load_tensors(

  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5152,11 +5276,9 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5169,8 +5291,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5188,7 +5310,12 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ if (!model.output) {
+ // needs to be on GPU
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5343,6 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
- {
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
- }
-
- for (int i = 0; i < n_layer; ++i) {
- ggml_context * ctx_layer = ctx_for_layer(i);
- ggml_context * ctx_split = ctx_for_layer_split(i);
-
- auto & layer = model.layers[i];
-
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
- }
- } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  {
@@ -5325,14 +5411,14 @@ static bool llm_load_tensors(
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
@@ -5394,18 +5480,16 @@ static bool llm_load_tensors(
  case LLM_ARCH_MPT:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5416,31 +5500,31 @@ static bool llm_load_tensors(
  auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // AWQ ScaleActivation layer
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  }
  } break;
  case LLM_ARCH_STABLELM:
@@ -5469,17 +5553,17 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors, present in Stable LM 2 1.6B
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional q and k layernorms, present in StableLM 2 12B
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5522,12 +5606,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5625,8 +5707,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  if (layer.wqkv == nullptr) {
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5663,17 +5745,20 @@ static bool llm_load_tensors(
  ggml_context* ctx_layer = ctx_for_layer(i);
  ggml_context* ctx_split = ctx_for_layer_split(i);

- auto& layer = model.layers[i];
+ auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  }
  } break;
  case LLM_ARCH_PLAMO:
@@ -5842,9 +5927,7 @@ static bool llm_load_tensors(

  // output
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

  const int64_t n_ff = hparams.n_ff;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5879,12 +5962,10 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  }
@@ -5935,12 +6016,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -6001,9 +6080,7 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  // init output from the input tok embed
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6112,10 @@ static bool llm_load_tensors(

  // output
  {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -6060,6 +6135,81 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -6324,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(
6324
6474
 
6325
6475
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
6326
6476
  } else {
6327
- #ifdef GGML_USE_MPI
6328
- GGML_ASSERT(false && "not implemented");
6329
- #endif
6330
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6477
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6331
6478
  inpL = lctx.inp_embd;
6332
6479
  ggml_set_input(lctx.inp_embd);
6333
6480
  }
@@ -6652,7 +6799,7 @@ static struct ggml_tensor * llm_build_kqv(

  cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);

- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
  }

@@ -6661,7 +6808,7 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6886,17 +7033,20 @@ struct llm_build_context {
  cb(lctx.inp_K_shift, "K_shift", -1);
  ggml_set_input(lctx.inp_K_shift);

+
  for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
  struct ggml_tensor * tmp =
  // we rotate only the first n_rot dimensions
- ggml_rope_custom_inplace(ctx0,
+ ggml_rope_ext_inplace(ctx0,
  ggml_view_3d(ctx0, kv_self.k_l[il],
  n_embd_head_k, n_head_kv, n_ctx,
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  0),
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
+
  cb(tmp, "K_shifted", il);
  ggml_build_forward_expand(gf, tmp);
  }
@@ -6999,6 +7149,17 @@ struct llm_build_context {
  return lctx.inp_pos;
  }

+ struct ggml_tensor * build_rope_factors(int il) {
+ // choose long/short freq factors based on the context size
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+ return model.layers[il].rope_long;
+ }
+
+ return model.layers[il].rope_short;
+ }
+
  struct ggml_tensor * build_inp_out_ids() {
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -7106,15 +7267,15 @@ struct llm_build_context {
7106
7267
  cb(Vcur, "Vcur", il);
7107
7268
  }
7108
7269
 
7109
- Qcur = ggml_rope_custom(
7110
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7270
+ Qcur = ggml_rope_ext(
7271
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7111
7272
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7112
7273
  ext_factor, attn_factor, beta_fast, beta_slow
7113
7274
  );
7114
7275
  cb(Qcur, "Qcur", il);
7115
7276
 
7116
- Kcur = ggml_rope_custom(
7117
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7277
+ Kcur = ggml_rope_ext(
7278
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7118
7279
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7119
7280
  ext_factor, attn_factor, beta_fast, beta_slow
7120
7281
  );
@@ -7236,13 +7397,13 @@ struct llm_build_context {
7236
7397
 
7237
7398
  switch (model.type) {
7238
7399
  case MODEL_7B:
7239
- Qcur = ggml_rope_custom(
7240
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7400
+ Qcur = ggml_rope_ext(
7401
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7241
7402
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7242
7403
  ext_factor, attn_factor, beta_fast, beta_slow
7243
7404
  );
7244
- Kcur = ggml_rope_custom(
7245
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7405
+ Kcur = ggml_rope_ext(
7406
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7246
7407
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7408
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7409
  );
@@ -7348,15 +7509,15 @@ struct llm_build_context {
7348
7509
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7349
7510
  cb(Vcur, "Vcur", il);
7350
7511
 
7351
- Qcur = ggml_rope_custom(
7352
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7512
+ Qcur = ggml_rope_ext(
7513
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7353
7514
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7354
7515
  ext_factor, attn_factor, beta_fast, beta_slow
7355
7516
  );
7356
7517
  cb(Qcur, "Qcur", il);
7357
7518
 
7358
- Kcur = ggml_rope_custom(
7359
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7519
+ Kcur = ggml_rope_ext(
7520
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7360
7521
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7361
7522
  ext_factor, attn_factor, beta_fast, beta_slow
7362
7523
  );
@@ -7469,14 +7630,14 @@ struct llm_build_context {
7469
7630
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7470
7631
 
7471
7632
  // using mode = 2 for neox mode
7472
- Qcur = ggml_rope_custom(
7473
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7633
+ Qcur = ggml_rope_ext(
7634
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7474
7635
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7475
7636
  );
7476
7637
  cb(Qcur, "Qcur", il);
7477
7638
 
7478
- Kcur = ggml_rope_custom(
7479
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7639
+ Kcur = ggml_rope_ext(
7640
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7480
7641
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7481
7642
  );
7482
7643
  cb(Kcur, "Kcur", il);
@@ -7592,15 +7753,15 @@ struct llm_build_context {
7592
7753
  cb(Vcur, "Vcur", il);
7593
7754
  }
7594
7755
 
7595
- Qcur = ggml_rope_custom(
7596
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7756
+ Qcur = ggml_rope_ext(
7757
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7597
7758
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7598
7759
  ext_factor, attn_factor, beta_fast, beta_slow
7599
7760
  );
7600
7761
  cb(Qcur, "Qcur", il);
7601
7762
 
7602
- Kcur = ggml_rope_custom(
7603
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7763
+ Kcur = ggml_rope_ext(
7764
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7604
7765
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7605
7766
  ext_factor, attn_factor, beta_fast, beta_slow
7606
7767
  );
@@ -7744,15 +7905,15 @@ struct llm_build_context {
7744
7905
  cb(Kcur, "Kcur", il);
7745
7906
  cb(Vcur, "Vcur", il);
7746
7907
 
7747
- Qcur = ggml_rope_custom(
7748
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7908
+ Qcur = ggml_rope_ext(
7909
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7749
7910
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7750
7911
  ext_factor, attn_factor, beta_fast, beta_slow
7751
7912
  );
7752
7913
  cb(Qcur, "Qcur", il);
7753
7914
 
7754
- Kcur = ggml_rope_custom(
7755
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7915
+ Kcur = ggml_rope_ext(
7916
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7756
7917
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7757
7918
  ext_factor, attn_factor, beta_fast, beta_slow
7758
7919
  );
@@ -7921,256 +8082,49 @@ struct llm_build_context {
7921
8082
  return gf;
7922
8083
  }
7923
8084
 
7924
- struct ggml_cgraph * build_persimmon() {
8085
+ struct ggml_cgraph * build_refact() {
7925
8086
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7926
8087
 
7927
8088
  const int64_t n_embd_head = hparams.n_embd_head_v;
7928
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7929
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
8089
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7930
8090
 
7931
8091
  struct ggml_tensor * cur;
7932
8092
  struct ggml_tensor * inpL;
7933
8093
 
7934
8094
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7935
8095
 
7936
- // inp_pos - contains the positions
7937
- struct ggml_tensor * inp_pos = build_inp_pos();
7938
-
7939
8096
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7940
8097
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7941
8098
 
7942
8099
  for (int il = 0; il < n_layer; ++il) {
7943
- struct ggml_tensor * residual = inpL;
8100
+ struct ggml_tensor * inpSA = inpL;
7944
8101
 
7945
8102
  cur = llm_build_norm(ctx0, inpL, hparams,
7946
- model.layers[il].attn_norm,
7947
- model.layers[il].attn_norm_b,
7948
- LLM_NORM, cb, il);
8103
+ model.layers[il].attn_norm, NULL,
8104
+ LLM_NORM_RMS, cb, il);
7949
8105
  cb(cur, "attn_norm", il);
7950
8106
 
7951
- // self attention
8107
+ // self-attention
7952
8108
  {
7953
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7954
- cb(cur, "wqkv", il);
8109
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8110
+ cb(Qcur, "Qcur", il);
7955
8111
 
7956
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7957
- cb(cur, "bqkv", il);
8112
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8113
+ cb(Kcur, "Kcur", il);
7958
8114
 
7959
- // split qkv
7960
- GGML_ASSERT(n_head_kv == n_head);
8115
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8116
+ cb(Vcur, "Vcur", il);
7961
8117
 
7962
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7963
- cb(tmpqkv, "tmpqkv", il);
8118
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8119
+ cb(Kcur, "Kcur", il);
7964
8120
 
7965
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7966
- cb(tmpqkv_perm, "tmpqkv", il);
8121
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8122
+ cb(Qcur, "Qcur", il);
7967
8123
 
7968
- struct ggml_tensor * tmpq = ggml_view_3d(
7969
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7970
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7971
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7972
- 0
7973
- );
7974
- cb(tmpq, "tmpq", il);
7975
-
7976
- struct ggml_tensor * tmpk = ggml_view_3d(
7977
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7978
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7979
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7980
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
7981
- );
7982
- cb(tmpk, "tmpk", il);
7983
-
7984
- // Q/K Layernorm
7985
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
7986
- model.layers[il].attn_q_norm,
7987
- model.layers[il].attn_q_norm_b,
7988
- LLM_NORM, cb, il);
7989
- cb(tmpq, "tmpq", il);
7990
-
7991
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
7992
- model.layers[il].attn_k_norm,
7993
- model.layers[il].attn_k_norm_b,
7994
- LLM_NORM, cb, il);
7995
- cb(tmpk, "tmpk", il);
7996
-
7997
- // RoPE the first n_rot of q/k, pass the other half, and concat.
7998
- struct ggml_tensor * qrot = ggml_view_3d(
7999
- ctx0, tmpq, n_rot, n_head, n_tokens,
8000
- ggml_element_size(tmpq) * n_embd_head,
8001
- ggml_element_size(tmpq) * n_embd_head * n_head,
8002
- 0
8003
- );
8004
- cb(qrot, "qrot", il);
8005
-
8006
- struct ggml_tensor * krot = ggml_view_3d(
8007
- ctx0, tmpk, n_rot, n_head, n_tokens,
8008
- ggml_element_size(tmpk) * n_embd_head,
8009
- ggml_element_size(tmpk) * n_embd_head * n_head,
8010
- 0
8011
- );
8012
- cb(krot, "krot", il);
8013
-
8014
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
8015
- struct ggml_tensor * qpass = ggml_view_3d(
8016
- ctx0, tmpq, n_rot, n_head, n_tokens,
8017
- ggml_element_size(tmpq) * n_embd_head,
8018
- ggml_element_size(tmpq) * n_embd_head * n_head,
8019
- ggml_element_size(tmpq) * n_rot
8020
- );
8021
- cb(qpass, "qpass", il);
8022
-
8023
- struct ggml_tensor * kpass = ggml_view_3d(
8024
- ctx0, tmpk, n_rot, n_head, n_tokens,
8025
- ggml_element_size(tmpk) * n_embd_head,
8026
- ggml_element_size(tmpk) * n_embd_head * n_head,
8027
- ggml_element_size(tmpk) * n_rot
8028
- );
8029
- cb(kpass, "kpass", il);
8030
-
8031
- struct ggml_tensor * qrotated = ggml_rope_custom(
8032
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8033
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8034
- );
8035
- cb(qrotated, "qrotated", il);
8036
-
8037
- struct ggml_tensor * krotated = ggml_rope_custom(
8038
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8039
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8040
- );
8041
- cb(krotated, "krotated", il);
8042
-
8043
- // ggml currently only supports concatenation on dim=2
8044
- // so we need to permute qrot, qpass, concat, then permute back.
8045
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
8046
- cb(qrotated, "qrotated", il);
8047
-
8048
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
8049
- cb(krotated, "krotated", il);
8050
-
8051
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
8052
- cb(qpass, "qpass", il);
8053
-
8054
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
8055
- cb(kpass, "kpass", il);
8056
-
8057
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
8058
- cb(Qcur, "Qcur", il);
8059
-
8060
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
8061
- cb(Kcur, "Kcur", il);
8062
-
8063
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
8064
- cb(Q, "Q", il);
8065
-
8066
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
8067
- cb(Kcur, "Kcur", il);
8068
-
8069
- struct ggml_tensor * Vcur = ggml_view_3d(
8070
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
8071
- ggml_element_size(tmpqkv_perm) * n_embd_head,
8072
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
8073
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
8074
- );
8075
- cb(Vcur, "Vcur", il);
8076
-
8077
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8078
- model.layers[il].wo, model.layers[il].bo,
8079
- Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8080
- }
8081
-
8082
- if (il == n_layer - 1) {
8083
- // skip computing output for unused tokens
8084
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8085
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8086
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8087
- }
8088
-
8089
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
8090
- cb(ffn_inp, "ffn_inp", il);
8091
-
8092
- // feed-forward network
8093
- {
8094
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
8095
- model.layers[il].ffn_norm,
8096
- model.layers[il].ffn_norm_b,
8097
- LLM_NORM, cb, il);
8098
- cb(cur, "ffn_norm", il);
8099
-
8100
- cur = llm_build_ffn(ctx0, cur,
8101
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8102
- NULL, NULL,
8103
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8104
- NULL,
8105
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
8106
- cb(cur, "ffn_out", il);
8107
- }
8108
-
8109
- cur = ggml_add(ctx0, cur, ffn_inp);
8110
- cb(cur, "l_out", il);
8111
-
8112
- inpL = cur;
8113
- }
8114
-
8115
- cur = inpL;
8116
-
8117
- cur = llm_build_norm(ctx0, cur, hparams,
8118
- model.output_norm,
8119
- model.output_norm_b,
8120
- LLM_NORM, cb, -1);
8121
- cb(cur, "result_norm", -1);
8122
-
8123
- cur = ggml_mul_mat(ctx0, model.output, cur);
8124
- cb(cur, "result_output", -1);
8125
-
8126
- ggml_build_forward_expand(gf, cur);
8127
-
8128
- return gf;
8129
- }
8130
-
8131
- struct ggml_cgraph * build_refact() {
8132
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8133
-
8134
- const int64_t n_embd_head = hparams.n_embd_head_v;
8135
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8136
-
8137
- struct ggml_tensor * cur;
8138
- struct ggml_tensor * inpL;
8139
-
8140
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8141
-
8142
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8143
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8144
-
8145
- for (int il = 0; il < n_layer; ++il) {
8146
- struct ggml_tensor * inpSA = inpL;
8147
-
8148
- cur = llm_build_norm(ctx0, inpL, hparams,
8149
- model.layers[il].attn_norm, NULL,
8150
- LLM_NORM_RMS, cb, il);
8151
- cb(cur, "attn_norm", il);
8152
-
8153
- // self-attention
8154
- {
8155
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8156
- cb(Qcur, "Qcur", il);
8157
-
8158
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8159
- cb(Kcur, "Kcur", il);
8160
-
8161
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8162
- cb(Vcur, "Vcur", il);
8163
-
8164
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8165
- cb(Kcur, "Kcur", il);
8166
-
8167
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8168
- cb(Qcur, "Qcur", il);
8169
-
8170
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8171
- model.layers[il].wo, NULL,
8172
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8173
- }
8124
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8125
+ model.layers[il].wo, NULL,
8126
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8127
+ }
8174
8128
 
8175
8129
  if (il == n_layer - 1) {
8176
8130
  // skip computing output for unused tokens
@@ -8304,15 +8258,15 @@ struct llm_build_context {
8304
8258
  cb(Kcur, "Kcur", il);
8305
8259
  cb(Vcur, "Vcur", il);
8306
8260
 
8307
- Qcur = ggml_rope_custom(
8308
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8261
+ Qcur = ggml_rope_ext(
8262
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8309
8263
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8310
8264
  ext_factor, attn_factor, beta_fast, beta_slow
8311
8265
  );
8312
8266
  cb(Qcur, "Qcur", il);
8313
8267
 
8314
- Kcur = ggml_rope_custom(
8315
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8268
+ Kcur = ggml_rope_ext(
8269
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8316
8270
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8317
8271
  ext_factor, attn_factor, beta_fast, beta_slow
8318
8272
  );
@@ -8744,15 +8698,15 @@ struct llm_build_context {
8744
8698
  }
8745
8699
 
8746
8700
 
8747
- Qcur = ggml_rope_custom(
8748
- ctx0, Qcur, inp_pos,
8701
+ Qcur = ggml_rope_ext(
8702
+ ctx0, Qcur, inp_pos, nullptr,
8749
8703
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8750
8704
  ext_factor, attn_factor, beta_fast, beta_slow
8751
8705
  );
8752
8706
  cb(Qcur, "Qcur", il);
8753
8707
 
8754
- Kcur = ggml_rope_custom(
8755
- ctx0, Kcur, inp_pos,
8708
+ Kcur = ggml_rope_ext(
8709
+ ctx0, Kcur, inp_pos, nullptr,
8756
8710
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8757
8711
  ext_factor, attn_factor, beta_fast, beta_slow
8758
8712
  );
@@ -8864,14 +8818,14 @@ struct llm_build_context {
8864
8818
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8865
8819
 
8866
8820
  // using mode = 2 for neox mode
8867
- Qcur = ggml_rope_custom(
8868
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8821
+ Qcur = ggml_rope_ext(
8822
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8869
8823
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8870
8824
  );
8871
8825
  cb(Qcur, "Qcur", il);
8872
8826
 
8873
- Kcur = ggml_rope_custom(
8874
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8827
+ Kcur = ggml_rope_ext(
8828
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8875
8829
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8876
8830
  );
8877
8831
  cb(Kcur, "Kcur", il);
@@ -8975,15 +8929,15 @@ struct llm_build_context {
8975
8929
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8976
8930
  cb(Vcur, "Vcur", il);
8977
8931
 
8978
- Qcur = ggml_rope_custom(
8979
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8932
+ Qcur = ggml_rope_ext(
8933
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8980
8934
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8981
8935
  ext_factor, attn_factor, beta_fast, beta_slow
8982
8936
  );
8983
8937
  cb(Qcur, "Qcur", il);
8984
8938
 
8985
- Kcur = ggml_rope_custom(
8986
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8939
+ Kcur = ggml_rope_ext(
8940
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8987
8941
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8988
8942
  ext_factor, attn_factor, beta_fast, beta_slow
8989
8943
  );
@@ -9089,15 +9043,15 @@ struct llm_build_context {
9089
9043
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9090
9044
  cb(Vcur, "Vcur", il);
9091
9045
 
9092
- Qcur = ggml_rope_custom(
9093
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9046
+ Qcur = ggml_rope_ext(
9047
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9094
9048
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9095
9049
  ext_factor, attn_factor, beta_fast, beta_slow
9096
9050
  );
9097
9051
  cb(Qcur, "Qcur", il);
9098
9052
 
9099
- Kcur = ggml_rope_custom(
9100
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9053
+ Kcur = ggml_rope_ext(
9054
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9101
9055
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9102
9056
  ext_factor, attn_factor, beta_fast, beta_slow
9103
9057
  );
@@ -9241,8 +9195,8 @@ struct llm_build_context {
9241
9195
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9242
9196
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9243
9197
 
9244
- Qcur = ggml_rope_custom(
9245
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9198
+ Qcur = ggml_rope_ext(
9199
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9246
9200
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9247
9201
  );
9248
9202
  cb(Qcur, "Qcur", il);
@@ -9252,8 +9206,8 @@ struct llm_build_context {
9252
9206
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9253
9207
  cb(Qcur, "Qcur", il);
9254
9208
 
9255
- Kcur = ggml_rope_custom(
9256
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9209
+ Kcur = ggml_rope_ext(
9210
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9257
9211
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9258
9212
  );
9259
9213
  cb(Kcur, "Kcur", il);
@@ -9329,6 +9283,9 @@ struct llm_build_context {

  // self-attention
  {
+ // rope freq factors for 128k context
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
  model.layers[il].attn_norm,
  NULL,
@@ -9360,8 +9317,8 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

- Qcur = ggml_rope_custom(
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);
@@ -9369,8 +9326,8 @@ struct llm_build_context {
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  cb(Qcur, "Qcur", il);

- Kcur = ggml_rope_custom(
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
@@ -9476,14 +9433,14 @@ struct llm_build_context {
9476
9433
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9477
9434
  cb(Vcur, "Vcur", il);
9478
9435
 
9479
- Qcur = ggml_rope_custom(
9480
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
9436
+ Qcur = ggml_rope_ext(
9437
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9481
9438
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9482
9439
  ext_factor, attn_factor, beta_fast, beta_slow);
9483
9440
  cb(Qcur, "Qcur", il);
9484
9441
 
9485
- Kcur = ggml_rope_custom(
9486
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
9442
+ Kcur = ggml_rope_ext(
9443
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9487
9444
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9488
9445
  ext_factor, attn_factor, beta_fast, beta_slow);
9489
9446
  cb(Kcur, "Kcur", il);
@@ -9684,15 +9641,15 @@ struct llm_build_context {
9684
9641
  cb(tmpk, "tmpk", il);
9685
9642
  cb(Vcur, "Vcur", il);
9686
9643
 
9687
- struct ggml_tensor * Qcur = ggml_rope_custom(
9688
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
9644
+ struct ggml_tensor * Qcur = ggml_rope_ext(
9645
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9689
9646
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9690
9647
  ext_factor, attn_factor, beta_fast, beta_slow
9691
9648
  );
9692
9649
  cb(Qcur, "Qcur", il);
9693
9650
 
9694
- struct ggml_tensor * Kcur = ggml_rope_custom(
9695
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
9651
+ struct ggml_tensor * Kcur = ggml_rope_ext(
9652
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9696
9653
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9697
9654
  ext_factor, attn_factor, beta_fast, beta_slow
9698
9655
  );
@@ -9800,15 +9757,15 @@ struct llm_build_context {
9800
9757
  // cb(Vcur, "Vcur", il);
9801
9758
  // }
9802
9759
 
9803
- Qcur = ggml_rope_custom(
9804
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9760
+ Qcur = ggml_rope_ext(
9761
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9805
9762
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9806
9763
  ext_factor, attn_factor, beta_fast, beta_slow
9807
9764
  );
9808
9765
  cb(Qcur, "Qcur", il);
9809
9766
 
9810
- Kcur = ggml_rope_custom(
9811
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9767
+ Kcur = ggml_rope_ext(
9768
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9812
9769
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9813
9770
  ext_factor, attn_factor, beta_fast, beta_slow
9814
9771
  );
@@ -9917,15 +9874,15 @@ struct llm_build_context {
9917
9874
  cb(Vcur, "Vcur", il);
9918
9875
  }
9919
9876
 
9920
- Qcur = ggml_rope_custom(
9921
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9877
+ Qcur = ggml_rope_ext(
9878
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9922
9879
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9923
9880
  ext_factor, attn_factor, beta_fast, beta_slow
9924
9881
  );
9925
9882
  cb(Qcur, "Qcur", il);
9926
9883
 
9927
- Kcur = ggml_rope_custom(
9928
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9884
+ Kcur = ggml_rope_ext(
9885
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9929
9886
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9930
9887
  ext_factor, attn_factor, beta_fast, beta_slow
9931
9888
  );
@@ -10047,15 +10004,15 @@ struct llm_build_context {
10047
10004
  cb(Vcur, "Vcur", il);
10048
10005
  }
10049
10006
 
10050
- Qcur = ggml_rope_custom(
10051
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10007
+ Qcur = ggml_rope_ext(
10008
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10052
10009
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10053
10010
  ext_factor, attn_factor, beta_fast, beta_slow
10054
10011
  );
10055
10012
  cb(Qcur, "Qcur", il);
10056
10013
 
10057
- Kcur = ggml_rope_custom(
10058
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10014
+ Kcur = ggml_rope_ext(
10015
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10059
10016
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10060
10017
  ext_factor, attn_factor, beta_fast, beta_slow
10061
10018
  );
@@ -10167,8 +10124,8 @@ struct llm_build_context {
10167
10124
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10168
10125
  cb(Vcur, "Vcur", il);
10169
10126
 
10170
- Qcur = ggml_rope_custom(
10171
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
10127
+ Qcur = ggml_rope_ext(
10128
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10172
10129
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10173
10130
  ext_factor, attn_factor, beta_fast, beta_slow);
10174
10131
  cb(Qcur, "Qcur", il);
@@ -10176,8 +10133,8 @@ struct llm_build_context {
10176
10133
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
10177
10134
  cb(Qcur, "Qcur_scaled", il);
10178
10135
 
10179
- Kcur = ggml_rope_custom(
10180
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
10136
+ Kcur = ggml_rope_ext(
10137
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10181
10138
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10182
10139
  ext_factor, attn_factor, beta_fast, beta_slow);
10183
10140
  cb(Kcur, "Kcur", il);
@@ -10287,15 +10244,15 @@ struct llm_build_context {
10287
10244
  cb(Vcur, "Vcur", il);
10288
10245
  }
10289
10246
 
10290
- Qcur = ggml_rope_custom(
10291
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10247
+ Qcur = ggml_rope_ext(
10248
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10292
10249
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10293
10250
  ext_factor, attn_factor, beta_fast, beta_slow
10294
10251
  );
10295
10252
  cb(Qcur, "Qcur", il);
10296
10253
 
10297
- Kcur = ggml_rope_custom(
10298
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10254
+ Kcur = ggml_rope_ext(
10255
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10299
10256
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10300
10257
  ext_factor, attn_factor, beta_fast, beta_slow
10301
10258
  );
@@ -10577,15 +10534,15 @@ struct llm_build_context {
10577
10534
  cb(Kcur, "Kcur", il);
10578
10535
  }
10579
10536
 
10580
- Qcur = ggml_rope_custom(
10581
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10537
+ Qcur = ggml_rope_ext(
10538
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10582
10539
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10583
10540
  ext_factor, attn_factor, beta_fast, beta_slow
10584
10541
  );
10585
10542
  cb(Qcur, "Qcur", il);
10586
10543
 
10587
- Kcur = ggml_rope_custom(
10588
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10544
+ Kcur = ggml_rope_ext(
10545
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10589
10546
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10590
10547
  ext_factor, attn_factor, beta_fast, beta_slow
10591
10548
  );
@@ -10708,15 +10665,15 @@ struct llm_build_context {
10708
10665
  cb(Vcur, "Vcur", il);
10709
10666
  }
10710
10667
 
10711
- Qcur = ggml_rope_custom(
10712
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10668
+ Qcur = ggml_rope_ext(
10669
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10713
10670
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10714
10671
  ext_factor, attn_factor, beta_fast, beta_slow
10715
10672
  );
10716
10673
  cb(Qcur, "Qcur", il);
10717
10674
 
10718
- Kcur = ggml_rope_custom(
10719
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10675
+ Kcur = ggml_rope_ext(
10676
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10720
10677
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10721
10678
  ext_factor, attn_factor, beta_fast, beta_slow
10722
10679
  );
@@ -10780,6 +10737,274 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_gptneox() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // ffn
+ if (hparams.use_par_res) {
+ // attention and ffn are computed in parallel
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
+
+ struct ggml_tensor * attn_out = cur;
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "ffn_out", il);
+
+ inpL = ggml_add(ctx0, cur, attn_out);
+ cb(inpL, "l_out", il);
+ } else {
+ // attention and ffn are computed sequentially
+ // x = x + attn(ln1(x))
+ // x = x + ffn(ln2(x))
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+ }
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_arctic() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+ cb(ffn_out, "ffn_out", il);
+
+ // MoE
+ cur = llm_build_norm(ctx0, inpSA, hparams,
+ model.layers[il].ffn_norm_exps, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm_exps", il);
+
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -10896,10 +11121,6 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_starcoder();
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- result = llm.build_persimmon();
- } break;
  case LLM_ARCH_REFACT:
  {
  result = llm.build_refact();
@@ -10994,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_olmo();
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ result = llm.build_gptneox();
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ result = llm.build_arctic();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -11339,11 +11568,6 @@ static void llama_graph_compute(
  llama_context & lctx,
  ggml_cgraph * gf,
  int n_threads) {
- #ifdef GGML_USE_MPI
- const int64_t n_layer = lctx.model.hparams.n_layer;
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
- #endif
-
  #ifdef GGML_USE_METAL
  if (ggml_backend_is_metal(lctx.backend_metal)) {
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11358,10 +11582,6 @@ static void llama_graph_compute(
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);

  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
- #ifdef GGML_USE_MPI
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
- #endif
  }

  // decode a batch of tokens by evaluating the transformer
@@ -11399,12 +11619,6 @@ static int llama_decode_internal(
  }
  lctx.n_queued_tokens += n_tokens_all;

- #ifdef GGML_USE_MPI
- // TODO: needs fix after #3228
- GGML_ASSERT(false && "not implemented");
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
- #endif
-
  auto & kv_self = lctx.kv_self;

  const int64_t n_embd = hparams.n_embd;
@@ -12354,6 +12568,7 @@ struct llm_tokenizer_bpe {
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
  });
  break;
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
  word_collection = unicode_regex_split(text, {
  // original regex from tokenizer.json
@@ -12788,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // tokenizer.encode('', add_special_tokens=True) returns [1]
  // tokenizer.encode('', add_special_tokens=False) returns []

+ static const bool rtrim = true; //TODO: as param
+ bool is_prev_special = false;
+ bool special_token_rtrim = false;
+
  if (add_special && vocab.special_add_bos != 0) {
  GGML_ASSERT(vocab.special_bos_id != -1);
  output.push_back(vocab.special_bos_id);
+ is_prev_special = true;
  }

  for (const auto & fragment : fragment_buffer) {
@@ -12802,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // and passing 'add space prefix' as bool argument
  //
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
- if (&fragment == &fragment_buffer.front()) {
- if (vocab.add_space_prefix) {
- raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+ if (special_token_rtrim) {
+ size_t num_whitespaces = 0;
+ while (isspace(raw_text[num_whitespaces])) {
+ num_whitespaces++;
+ }
+ if (num_whitespaces == raw_text.size()) {
+ continue; // skip if all whitespaces
+ }
+ raw_text = raw_text.substr(num_whitespaces);
+ }
+
+ if (vocab.add_space_prefix) {
+ if (!output.size() || is_prev_special) { // prefix with space if first token
+ raw_text = " " + raw_text;
  }
  }

@@ -12816,6 +13048,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  tokenizer.tokenize(raw_text, output);
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
+ is_prev_special = true;
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
+ special_token_rtrim = rtrim
+ && fragment.token != vocab.special_bos_id
+ && fragment.token != vocab.special_unk_id
+ && fragment.token != vocab.special_eos_id;
  }
  }

@@ -14518,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
- (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  if (qs.model.type == MODEL_70B) {
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15533,10 +15769,6 @@ void llama_backend_init(void) {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
-
- #ifdef GGML_USE_MPI
- ggml_mpi_backend_init();
- #endif
  }

  void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15546,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
  }

  void llama_backend_free(void) {
- #ifdef GGML_USE_MPI
- ggml_mpi_backend_free();
- #endif
  ggml_quantize_free();
  }

@@ -15691,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
  }

+ cparams.yarn_attn_factor *= hparams.rope_attn_factor;
  cparams.causal_attn = hparams.causal_attn;

  if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15949,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
  }
  }

- #ifdef GGML_USE_MPI
- ctx->ctx_mpi = ggml_mpi_init();
-
- if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
- // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
- // TODO: needs fix after #3228
- GGML_ASSERT(false && "not implemented");
- //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
- //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
- llama_backend_free();
- exit(1);
- }
- #endif
-
  return ctx;
  }

@@ -15999,7 +16215,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  // these models do not use RoPE
  case LLM_ARCH_GPT2:
  case LLM_ARCH_GPTJ:
- case LLM_ARCH_GPTNEOX:
  case LLM_ARCH_MPT:
  case LLM_ARCH_REFACT:
  case LLM_ARCH_BLOOM:
@@ -16019,13 +16234,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_XVERSE:
  case LLM_ARCH_COMMAND_R:
  case LLM_ARCH_OLMO:
+ case LLM_ARCH_ARCTIC:
  return LLAMA_ROPE_TYPE_NORM;

  // the pairs of head values are offset by n_rot/2
  case LLM_ARCH_FALCON:
  case LLM_ARCH_GROK:
  case LLM_ARCH_DBRX:
- case LLM_ARCH_PERSIMMON:
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_STABLELM:
@@ -16036,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_PHI3:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_STARCODER2:
+ case LLM_ARCH_GPTNEOX:
  return LLAMA_ROPE_TYPE_NEOX;

  // all model arches should be listed explicitly here
@@ -16195,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
  }

  // make tensors
+ cvec.tensors.reserve(model.hparams.n_layer);
  cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
  for (size_t il = 1; il < model.hparams.n_layer; il++) {
  struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16203,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
  }

  // allocate tensors / buffers and zero
+ cvec.ctxs.reserve(ctx_map.size());
+ cvec.bufs.reserve(ctx_map.size());
  for (auto it : ctx_map) {
  ggml_backend_buffer_type_t buft = it.first;
  ggml_context * ctx = it.second;
@@ -17411,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
  ctx->cparams.n_threads_batch = n_threads_batch;
  }

+ uint32_t llama_n_threads(struct llama_context * ctx) {
+ return ctx->cparams.n_threads;
+ }
+
+ uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+ return ctx->cparams.n_threads_batch;
+ }
+
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
  ctx->abort_callback = abort_callback;
  ctx->abort_callback_data = abort_callback_data;
@@ -17845,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
  }
  }
  // llama2 templates seem to not care about "add_generation_prompt"
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+ // Phi 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
  } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
  // zephyr template
  for (auto message : chat) {
@@ -17977,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
  }
- } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
- // Phi 3
- for (auto message : chat) {
- std::string role(message->role);
- ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
- }
- if (add_ass) {
- ss << "<|assistant|>\n";
- }
  } else {
  // template not supported
  return -1;
@@ -18107,6 +18334,7 @@ const char * llama_print_system_info(void) {
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
  s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+ s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
@@ -18167,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback_user_data = user_data;
  #ifdef GGML_USE_METAL
  ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+ ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
  #endif
  }