@fugood/llama.node 0.2.1 → 0.2.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
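A diff like this can usually be reproduced locally with npm's built-in diff command (available since npm 7), for example: npm diff --diff=@fugood/llama.node@0.2.1 --diff=@fugood/llama.node@0.2.2. The local output may be formatted slightly differently from the registry view.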
Files changed (79)
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/llama.cpp/CMakeLists.txt +72 -46
  20. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  21. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  22. package/src/llama.cpp/common/common.cpp +732 -752
  23. package/src/llama.cpp/common/common.h +47 -41
  24. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  25. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  26. package/src/llama.cpp/common/log.h +5 -5
  27. package/src/llama.cpp/common/sampling.cpp +89 -7
  28. package/src/llama.cpp/common/sampling.h +5 -0
  29. package/src/llama.cpp/common/train.cpp +2 -2
  30. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  31. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  32. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  33. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  34. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  35. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  36. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  37. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  38. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  39. package/src/llama.cpp/examples/llava/clip.h +1 -1
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  41. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  42. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  43. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  44. package/src/llama.cpp/examples/main/main.cpp +24 -16
  45. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  46. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  47. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  48. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  49. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  50. package/src/llama.cpp/examples/server/server.cpp +21 -9
  51. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  52. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  53. package/src/llama.cpp/ggml-backend.c +0 -1
  54. package/src/llama.cpp/ggml-common.h +0 -54
  55. package/src/llama.cpp/ggml-cuda.h +1 -0
  56. package/src/llama.cpp/ggml-impl.h +51 -0
  57. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  58. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  59. package/src/llama.cpp/ggml-quants.c +3700 -2041
  60. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  61. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  62. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  63. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  64. package/src/llama.cpp/ggml.c +1034 -1154
  65. package/src/llama.cpp/ggml.h +59 -31
  66. package/src/llama.cpp/llama.cpp +859 -609
  67. package/src/llama.cpp/llama.h +19 -6
  68. package/src/llama.cpp/requirements.txt +0 -1
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  70. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  71. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  72. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  73. package/src/llama.cpp/unicode-data.h +15 -12
  74. package/src/llama.cpp/unicode.cpp +89 -111
  75. package/src/llama.cpp/unicode.h +44 -12
  76. package/src/llama.cpp/build.zig +0 -172
  77. package/src/llama.cpp/ggml-mpi.c +0 -216
  78. package/src/llama.cpp/ggml-mpi.h +0 -39
  79. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
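The bulk of this release is a sync of the vendored llama.cpp sources under package/src/llama.cpp. One change that recurs throughout the llama.cpp hunks below is the llama_model_loader API moving from a trailing boolean "required" argument to an integer flags argument that combines TENSOR_NOT_REQUIRED and TENSOR_DUPLICATED. The following standalone C++ sketch is illustrative only (it is not code from this package); the names are hypothetical stand-ins that mirror the flag values visible in the diff:

  #include <cstdio>

  // Hypothetical stand-ins mirroring the flag values shown in the diff below.
  static const int TENSOR_NOT_REQUIRED = 1; // a missing tensor is not an error
  static const int TENSOR_DUPLICATED   = 2; // tensor duplicates existing data; count its bytes, not a new creation

  static void create_tensor_stub(const char * name, int flags = 0) {
      const bool required   = !(flags & TENSOR_NOT_REQUIRED);
      const bool duplicated =  (flags & TENSOR_DUPLICATED) != 0;
      std::printf("%s: required=%d duplicated=%d\n", name, required, duplicated);
  }

  int main() {
      create_tensor_stub("output.weight",     TENSOR_NOT_REQUIRED);
      create_tensor_stub("token_embd.weight", TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED);
      return 0;
  }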
@@ -26,16 +26,9 @@
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
- #ifdef GGML_USE_MPI
- # include "ggml-mpi.h"
- #endif
- #ifndef QK_K
- # ifdef GGML_QKK_64
- # define QK_K 64
- # else
- # define QK_K 256
- # endif
- #endif
+
+ // TODO: replace with ggml API call
+ #define QK_K 256

  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 60
+ #define LLAMA_MAX_EXPERTS 128

  //
  // logging
@@ -205,7 +198,6 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,7 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_ARCTIC,
  LLM_ARCH_UNKNOWN,
  };

@@ -242,7 +235,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MPT, "mpt" },
  { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -266,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_ARCTIC, "arctic" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -309,6 +302,7 @@ enum llm_kv {
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
  LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,

@@ -386,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

@@ -441,6 +436,8 @@ enum llm_tensor {
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
  LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
  LLM_TENSOR_ATTN_Q,
  LLM_TENSOR_ATTN_K,
  LLM_TENSOR_ATTN_V,
@@ -460,6 +457,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_NORM_EXPS,
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
@@ -598,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
- {
- LLM_ARCH_PERSIMMON,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
- { LLM_TENSOR_OUTPUT, "output"},
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
- },
- },
  {
  LLM_ARCH_MPT,
  {
@@ -825,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  {
  LLM_ARCH_PHI3,
  {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -1052,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_ARCTIC,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1697,6 +1702,8 @@ struct llama_state {
  llama_state() {
  #ifdef GGML_USE_METAL
  ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+ ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
  #endif
  }

@@ -1710,17 +1717,24 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_14M,
  MODEL_17M,
  MODEL_22M,
  MODEL_33M,
+ MODEL_70M,
  MODEL_109M,
  MODEL_137M,
+ MODEL_160M,
  MODEL_335M,
+ MODEL_410M,
  MODEL_0_5B,
  MODEL_1B,
+ MODEL_1_4B,
  MODEL_2B,
+ MODEL_2_8B,
  MODEL_3B,
  MODEL_4B,
+ MODEL_6_9B,
  MODEL_7B,
  MODEL_8B,
  MODEL_12B,
@@ -1743,6 +1757,7 @@ enum e_model {
  MODEL_8x7B,
  MODEL_8x22B,
  MODEL_16x12B,
+ MODEL_10B_128x3_66B,
  };

  static const size_t kiB = 1024;
@@ -1752,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
  struct llama_hparams {
  bool vocab_only;
  bool rope_finetuned;
+ bool use_par_res;

  uint32_t n_vocab;
  uint32_t n_ctx_train; // context size the model was trained on
@@ -1770,6 +1786,7 @@
  float f_norm_eps;
  float f_norm_rms_eps;

+ float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
@@ -1818,6 +1835,7 @@

  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+ if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

@@ -1915,6 +1933,7 @@ struct llama_layer {
  struct ggml_tensor * ffn_norm_b;
  struct ggml_tensor * layer_out_norm;
  struct ggml_tensor * layer_out_norm_b;
+ struct ggml_tensor * ffn_norm_exps;

  // ff
  struct ggml_tensor * ffn_gate; // w1
@@ -1952,6 +1971,10 @@
  // mamba bias
  struct ggml_tensor * ssm_conv1d_b;
  struct ggml_tensor * ssm_dt_b;
+
+ // long rope factors
+ struct ggml_tensor * rope_long = nullptr;
+ struct ggml_tensor * rope_short = nullptr;
  };

  struct llama_kv_cell {
@@ -2268,10 +2291,6 @@ struct llama_context {

  // control vectors
  struct llama_control_vector cvec;
-
- #ifdef GGML_USE_MPI
- ggml_mpi_context * ctx_mpi = NULL;
- #endif
  };

  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2510,6 @@ static bool llama_kv_cache_init(
  static bool llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_batch & batch) {
- const uint32_t n_ctx = cache.size;
  const uint32_t n_tokens = batch.n_tokens;

  if (cache.recurrent) {
@@ -2542,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
  }
  // otherwise, one cell per token.

- if (n_tokens > n_ctx) {
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ if (n_tokens > cache.size) {
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
  return false;
  }

  uint32_t n_tested = 0;

  while (true) {
- if (cache.head + n_tokens > n_ctx) {
- n_tested += n_ctx - cache.head;
+ if (cache.head + n_tokens > cache.size) {
+ n_tested += cache.size - cache.head;
  cache.head = 0;
  continue;
  }
@@ -2570,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
  break;
  }

- if (n_tested >= n_ctx) {
+ if (n_tested >= cache.size) {
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
  return false;
  }
@@ -3330,6 +3348,39 @@ struct llama_model_loader {
  return get_arr_n(llm_kv(kid), result, required);
  }

+ template<typename T>
+ bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+ const int kid = gguf_find_key(meta, key.c_str());
+
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+ if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+ throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+ }
+
+ // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+ result.resize(arr_info.length);
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+ return true;
+ }
+
+ template<typename T>
+ bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+ return get_arr(llm_kv(kid), result, required);
+ }
+
  template<typename T>
  bool get_key(const std::string & key, T & result, const bool required = true) {
  auto it = kv_overrides.find(key);
@@ -3404,11 +3455,15 @@ struct llama_model_loader {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
  ggml_set_name(tensor, ggml_get_name(cur));

- n_created++;
+ if (duplicated) {
+ size_data += ggml_nbytes(cur);
+ } else {
+ n_created++;
+ }

  return tensor;
  }
@@ -3443,14 +3498,17 @@ struct llama_model_loader {
  return cur;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+ static const int TENSOR_NOT_REQUIRED = 1;
+ static const int TENSOR_DUPLICATED = 2;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

  if (cur == NULL) {
  return NULL;
  }

- return create_tensor_for(ctx, cur);
+ return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
  }

  struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3750,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

  static const char * llama_model_type_name(e_model type) {
  switch (type) {
- case MODEL_22M: return "22M";
- case MODEL_33M: return "33M";
- case MODEL_109M: return "109M";
- case MODEL_137M: return "137M";
- case MODEL_0_5B: return "0.5B";
- case MODEL_1B: return "1B";
- case MODEL_2B: return "2B";
- case MODEL_3B: return "3B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_12B: return "12B";
- case MODEL_13B: return "13B";
- case MODEL_14B: return "14B";
- case MODEL_15B: return "15B";
- case MODEL_20B: return "20B";
- case MODEL_30B: return "30B";
- case MODEL_34B: return "34B";
- case MODEL_35B: return "35B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- case MODEL_314B: return "314B";
- case MODEL_SMALL: return "0.1B";
- case MODEL_MEDIUM: return "0.4B";
- case MODEL_LARGE: return "0.8B";
- case MODEL_XL: return "1.5B";
- case MODEL_A2_7B: return "A2.7B";
- case MODEL_8x7B: return "8x7B";
- case MODEL_8x22B: return "8x22B";
- case MODEL_16x12B: return "16x12B";
- default: return "?B";
+ case MODEL_14M: return "14M";
+ case MODEL_17M: return "17M";
+ case MODEL_22M: return "22M";
+ case MODEL_33M: return "33M";
+ case MODEL_70M: return "70M";
+ case MODEL_109M: return "109M";
+ case MODEL_137M: return "137M";
+ case MODEL_160M: return "160M";
+ case MODEL_335M: return "335M";
+ case MODEL_410M: return "410M";
+ case MODEL_0_5B: return "0.5B";
+ case MODEL_1B: return "1B";
+ case MODEL_1_4B: return "1.4B";
+ case MODEL_2B: return "2B";
+ case MODEL_2_8B: return "2.8B";
+ case MODEL_3B: return "3B";
+ case MODEL_4B: return "4B";
+ case MODEL_6_9B: return "6.9B";
+ case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
+ case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
+ case MODEL_15B: return "15B";
+ case MODEL_20B: return "20B";
+ case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
+ case MODEL_35B: return "35B";
+ case MODEL_40B: return "40B";
+ case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
+ case MODEL_314B: return "314B";
+ case MODEL_SMALL: return "0.1B";
+ case MODEL_MEDIUM: return "0.4B";
+ case MODEL_LARGE: return "0.8B";
+ case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
+ case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+ default: return "?B";
  }
  }

@@ -3873,6 +3942,8 @@ static void llm_load_hparams(
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
  // sanity check for n_rot (optional)
  {
  hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3972,14 +4043,6 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 36: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
  case LLM_ARCH_REFACT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4184,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_14B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4261,6 +4325,65 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ switch (hparams.n_layer) {
+ case 6:
+ switch (hparams.n_ff) {
+ case 512: model.type = e_model::MODEL_14M; break;
+ case 2048: model.type = e_model::MODEL_70M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 12:
+ switch (hparams.n_ff) {
+ case 3072: model.type = e_model::MODEL_160M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 16:
+ switch (hparams.n_ff) {
+ case 8192: model.type = e_model::MODEL_1B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff) {
+ case 4096: model.type = e_model::MODEL_410M; break;
+ case 8192: model.type = e_model::MODEL_1_4B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 32:
+ switch (hparams.n_ff) {
+ case 10240: model.type = e_model::MODEL_2_8B; break;
+ case 16384: model.type = e_model::MODEL_6_9B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 36:
+ switch (hparams.n_ff) {
+ case 20480: model.type = e_model::MODEL_12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 44:
+ switch (hparams.n_ff) {
+ case 24576: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ if (hparams.n_expert == 128) {
+ switch (hparams.n_layer) {
+ case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4461,12 +4584,18 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "qwen2") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "stablelm2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
  } else if (
  tokenizer_pre == "olmo") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
  } else if (
  tokenizer_pre == "dbrx") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -4582,7 +4711,8 @@ static void llm_load_vocab(
  (t.first == "<|eot_id|>" ||
  t.first == "<|im_end|>" ||
  t.first == "<|end|>" ||
- t.first == "<end_of_turn>"
+ t.first == "<end_of_turn>" ||
+ t.first == "<|endoftext|>"
  )
  ) {
  vocab.special_eot_id = t.second;
@@ -4908,6 +5038,7 @@ static bool llm_load_tensors(
  // create tensors for the weights
  {
  const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_head = n_embd / hparams.n_head;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4942,12 +5073,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  if (model.arch != LLM_ARCH_MINICPM){
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }
  }
@@ -4966,10 +5095,10 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

@@ -4980,7 +5109,7 @@ static bool llm_load_tensors(
  } else {
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5022,12 +5151,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5050,7 +5177,7 @@ static bool llm_load_tensors(

  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5152,11 +5279,9 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5169,8 +5294,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5188,7 +5313,12 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ if (!model.output) {
+ // needs to be on GPU
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5346,6 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
- {
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
- }
-
- for (int i = 0; i < n_layer; ++i) {
- ggml_context * ctx_layer = ctx_for_layer(i);
- ggml_context * ctx_split = ctx_for_layer_split(i);
-
- auto & layer = model.layers[i];
-
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
- }
- } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  {
@@ -5325,14 +5414,14 @@ static bool llm_load_tensors(
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
@@ -5394,18 +5483,16 @@ static bool llm_load_tensors(
  case LLM_ARCH_MPT:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (!model.output) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  }
  }

@@ -5416,31 +5503,31 @@ static bool llm_load_tensors(
  auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // AWQ ScaleActivation layer
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  }
  } break;
  case LLM_ARCH_STABLELM:
@@ -5469,17 +5556,17 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors, present in Stable LM 2 1.6B
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional q and k layernorms, present in StableLM 2 12B
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);

  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5522,12 +5609,10 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -5625,8 +5710,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

  if (layer.wqkv == nullptr) {
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5663,17 +5748,20 @@ static bool llm_load_tensors(
  ggml_context* ctx_layer = ctx_for_layer(i);
  ggml_context* ctx_split = ctx_for_layer_split(i);

- auto& layer = model.layers[i];
+ auto & layer = model.layers[i];

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });

- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });

  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  }
  } break;
  case LLM_ARCH_PLAMO:
@@ -5842,9 +5930,7 @@ static bool llm_load_tensors(

  // output
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

  const int64_t n_ff = hparams.n_ff;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5879,12 +5965,10 @@ static bool llm_load_tensors(
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  }
@@ -5935,12 +6019,10 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -6001,9 +6083,7 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  // init output from the input tok embed
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6115,10 @@ static bool llm_load_tensors(

  // output
  {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -6060,6 +6138,81 @@ static bool llm_load_tensors(
6060
6138
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6061
6139
  }
6062
6140
  } break;
6141
+ case LLM_ARCH_GPTNEOX:
6142
+ {
6143
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6144
+ // output
6145
+ {
6146
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6147
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
6148
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6149
+ }
6150
+
6151
+ for (int i = 0; i < n_layer; ++i) {
6152
+ ggml_context * ctx_layer = ctx_for_layer(i);
6153
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6154
+
6155
+ auto & layer = model.layers[i];
6156
+
6157
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6158
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
6159
+
6160
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
6161
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
6162
+
6163
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6164
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
6165
+
6166
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6167
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
6168
+
6169
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
6170
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
6171
+
6172
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6173
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
6174
+ }
6175
+ } break;
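The new GPT-NeoX tensor case loads a fused QKV projection per layer: wqkv is {n_embd, n_embd + 2*n_embd_gqa} with a matching bqkv bias, plus biased attention and FFN norms. The added build_gptneox() graph further down splits that fused result back into Q/K/V with strided views; condensed here from the + lines below for reference:

    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);   // fused [ Q | K | V ] rows
    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
    struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
    struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
    struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));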
6176
+ case LLM_ARCH_ARCTIC:
6177
+ {
6178
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6179
+
6180
+ // output
6181
+ {
6182
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6183
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6184
+ // if output is NULL, init from the input tok embed
6185
+ if (model.output == NULL) {
6186
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6187
+ }
6188
+ }
6189
+
6190
+ for (int i = 0; i < n_layer; ++i) {
6191
+ ggml_context * ctx_layer = ctx_for_layer(i);
6192
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6193
+
6194
+ auto & layer = model.layers[i];
6195
+
6196
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6197
+
6198
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
6199
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
6200
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
6201
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6202
+
6203
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6204
+
6205
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
6206
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
6207
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
6208
+
6209
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6210
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
6211
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
6212
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
6213
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
6214
+ }
6215
+ } break;
6063
6216
  default:
6064
6217
  throw std::runtime_error("unknown architecture");
6065
6218
  }
@@ -6324,10 +6477,7 @@ static struct ggml_tensor * llm_build_inp_embd(
6324
6477
 
6325
6478
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
6326
6479
  } else {
6327
- #ifdef GGML_USE_MPI
6328
- GGML_ASSERT(false && "not implemented");
6329
- #endif
6330
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6480
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6331
6481
  inpL = lctx.inp_embd;
6332
6482
  ggml_set_input(lctx.inp_embd);
6333
6483
  }
@@ -6622,6 +6772,7 @@ static struct ggml_tensor * llm_build_kqv(
6622
6772
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
6623
6773
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6624
6774
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
6775
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6625
6776
 
6626
6777
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
6627
6778
  cb(q, "q", il);
@@ -6644,23 +6795,23 @@ static struct ggml_tensor * llm_build_kqv(
6644
6795
  struct ggml_tensor * v =
6645
6796
  ggml_view_3d(ctx, kv.v_l[il],
6646
6797
  n_embd_head_v, n_kv, n_head_kv,
6647
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6648
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6798
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
6799
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
6649
6800
  0);
6650
6801
  cb(v, "v", il);
6651
6802
 
6652
6803
  cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6653
6804
 
6654
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6805
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6655
6806
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6656
6807
  }
6657
6808
 
6658
- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6809
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
6659
6810
  } else {
6660
6811
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6661
6812
  cb(kq, "kq", il);
6662
6813
 
6663
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6814
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6664
6815
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6665
6816
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6666
6817
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6700,7 +6851,7 @@ static struct ggml_tensor * llm_build_kqv(
6700
6851
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6701
6852
  cb(kqv_merged, "kqv_merged", il);
6702
6853
 
6703
- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6854
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
6704
6855
  cb(cur, "kqv_merged_cont", il);
6705
6856
  }
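Both hunks in llm_build_kqv switch the attention output path from the K-head dimensions to the V-head dimensions (n_embd_head_v, n_embd_v_gqa), which only matters for models whose K and V head sizes differ, and add LLM_ARCH_GPTNEOX to the architectures that force F32 precision to avoid NaNs. Condensed sketch of the corrected flash-attention path, mirroring the + lines above:

    struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il],
            n_embd_head_v, n_kv, n_head_kv,
            ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),   // stride between positions (was n_embd_k_gqa)
            ggml_row_size(kv.v_l[il]->type, n_embd_head_v),  // stride between heads (was n_embd_head_k)
            0);
    cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
    cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);  // output rows sized by V heads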
6706
6857
 
@@ -6885,17 +7036,20 @@ struct llm_build_context {
6885
7036
  cb(lctx.inp_K_shift, "K_shift", -1);
6886
7037
  ggml_set_input(lctx.inp_K_shift);
6887
7038
 
7039
+
6888
7040
  for (int il = 0; il < n_layer; ++il) {
7041
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
6889
7042
  struct ggml_tensor * tmp =
6890
7043
  // we rotate only the first n_rot dimensions
6891
- ggml_rope_custom_inplace(ctx0,
7044
+ ggml_rope_ext_inplace(ctx0,
6892
7045
  ggml_view_3d(ctx0, kv_self.k_l[il],
6893
7046
  n_embd_head_k, n_head_kv, n_ctx,
6894
7047
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
6895
7048
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6896
7049
  0),
6897
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7050
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6898
7051
  ext_factor, attn_factor, beta_fast, beta_slow);
7052
+
6899
7053
  cb(tmp, "K_shifted", il);
6900
7054
  ggml_build_forward_expand(gf, tmp);
6901
7055
  }
@@ -6998,6 +7152,17 @@ struct llm_build_context {
6998
7152
  return lctx.inp_pos;
6999
7153
  }
7000
7154
 
7155
+ struct ggml_tensor * build_rope_factors(int il) {
7156
+ // choose long/short freq factors based on the context size
7157
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7158
+
7159
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7160
+ return model.layers[il].rope_long;
7161
+ }
7162
+
7163
+ return model.layers[il].rope_short;
7164
+ }
7165
+
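The new build_rope_factors(il) helper chooses which RoPE frequency-factor tensor is fed to ggml_rope_ext: when the per-sequence context exceeds the model's original YaRN context it returns the layer's long factors, otherwise the short ones (used by the K-shift path above and the Phi-3 graph further down). Minimal usage sketch, assuming rope_long/rope_short were loaded for the model:

    const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
    struct ggml_tensor * rope_factors = (n_ctx_pre_seq > hparams.n_yarn_orig_ctx)
            ? model.layers[il].rope_long     // extended-context frequency factors
            : model.layers[il].rope_short;   // original-context frequency factors
    // rope_factors is then passed as the new tensor argument of ggml_rope_ext(...)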
7001
7166
  struct ggml_tensor * build_inp_out_ids() {
7002
7167
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
7003
7168
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -7105,15 +7270,15 @@ struct llm_build_context {
7105
7270
  cb(Vcur, "Vcur", il);
7106
7271
  }
7107
7272
 
7108
- Qcur = ggml_rope_custom(
7109
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7273
+ Qcur = ggml_rope_ext(
7274
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7110
7275
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7111
7276
  ext_factor, attn_factor, beta_fast, beta_slow
7112
7277
  );
7113
7278
  cb(Qcur, "Qcur", il);
7114
7279
 
7115
- Kcur = ggml_rope_custom(
7116
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7280
+ Kcur = ggml_rope_ext(
7281
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7117
7282
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7118
7283
  ext_factor, attn_factor, beta_fast, beta_slow
7119
7284
  );
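From here on, every ggml_rope_custom(...) call site is converted to ggml_rope_ext(...), which takes one extra tensor argument right after the positions: the optional RoPE frequency factors. Passing nullptr keeps the old behaviour; the K-shift path and the Phi-3 graph pass the tensor returned by build_rope_factors() instead. Before/after sketch of a single call site:

    // removed form
    Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos,
            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);
    // added form: identical, plus the frequency-factors tensor (nullptr = none)
    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);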
@@ -7235,13 +7400,13 @@ struct llm_build_context {
7235
7400
 
7236
7401
  switch (model.type) {
7237
7402
  case MODEL_7B:
7238
- Qcur = ggml_rope_custom(
7239
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7403
+ Qcur = ggml_rope_ext(
7404
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7240
7405
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7241
7406
  ext_factor, attn_factor, beta_fast, beta_slow
7242
7407
  );
7243
- Kcur = ggml_rope_custom(
7244
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7408
+ Kcur = ggml_rope_ext(
7409
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7245
7410
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7246
7411
  ext_factor, attn_factor, beta_fast, beta_slow
7247
7412
  );
@@ -7347,15 +7512,15 @@ struct llm_build_context {
7347
7512
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7348
7513
  cb(Vcur, "Vcur", il);
7349
7514
 
7350
- Qcur = ggml_rope_custom(
7351
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7515
+ Qcur = ggml_rope_ext(
7516
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7352
7517
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7353
7518
  ext_factor, attn_factor, beta_fast, beta_slow
7354
7519
  );
7355
7520
  cb(Qcur, "Qcur", il);
7356
7521
 
7357
- Kcur = ggml_rope_custom(
7358
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7522
+ Kcur = ggml_rope_ext(
7523
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7359
7524
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7360
7525
  ext_factor, attn_factor, beta_fast, beta_slow
7361
7526
  );
@@ -7468,14 +7633,14 @@ struct llm_build_context {
7468
7633
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7469
7634
 
7470
7635
  // using mode = 2 for neox mode
7471
- Qcur = ggml_rope_custom(
7472
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7636
+ Qcur = ggml_rope_ext(
7637
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7473
7638
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7474
7639
  );
7475
7640
  cb(Qcur, "Qcur", il);
7476
7641
 
7477
- Kcur = ggml_rope_custom(
7478
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7642
+ Kcur = ggml_rope_ext(
7643
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7479
7644
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7480
7645
  );
7481
7646
  cb(Kcur, "Kcur", il);
@@ -7591,15 +7756,15 @@ struct llm_build_context {
7591
7756
  cb(Vcur, "Vcur", il);
7592
7757
  }
7593
7758
 
7594
- Qcur = ggml_rope_custom(
7595
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7759
+ Qcur = ggml_rope_ext(
7760
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7596
7761
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7597
7762
  ext_factor, attn_factor, beta_fast, beta_slow
7598
7763
  );
7599
7764
  cb(Qcur, "Qcur", il);
7600
7765
 
7601
- Kcur = ggml_rope_custom(
7602
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7766
+ Kcur = ggml_rope_ext(
7767
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7603
7768
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7604
7769
  ext_factor, attn_factor, beta_fast, beta_slow
7605
7770
  );
@@ -7743,15 +7908,15 @@ struct llm_build_context {
7743
7908
  cb(Kcur, "Kcur", il);
7744
7909
  cb(Vcur, "Vcur", il);
7745
7910
 
7746
- Qcur = ggml_rope_custom(
7747
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7911
+ Qcur = ggml_rope_ext(
7912
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7748
7913
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7749
7914
  ext_factor, attn_factor, beta_fast, beta_slow
7750
7915
  );
7751
7916
  cb(Qcur, "Qcur", il);
7752
7917
 
7753
- Kcur = ggml_rope_custom(
7754
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7918
+ Kcur = ggml_rope_ext(
7919
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7755
7920
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7756
7921
  ext_factor, attn_factor, beta_fast, beta_slow
7757
7922
  );
@@ -7920,256 +8085,49 @@ struct llm_build_context {
7920
8085
  return gf;
7921
8086
  }
7922
8087
 
7923
- struct ggml_cgraph * build_persimmon() {
8088
+ struct ggml_cgraph * build_refact() {
7924
8089
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7925
8090
 
7926
8091
  const int64_t n_embd_head = hparams.n_embd_head_v;
7927
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7928
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
8092
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7929
8093
 
7930
8094
  struct ggml_tensor * cur;
7931
8095
  struct ggml_tensor * inpL;
7932
8096
 
7933
8097
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7934
8098
 
7935
- // inp_pos - contains the positions
7936
- struct ggml_tensor * inp_pos = build_inp_pos();
7937
-
7938
8099
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7939
8100
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7940
8101
 
7941
8102
  for (int il = 0; il < n_layer; ++il) {
7942
- struct ggml_tensor * residual = inpL;
8103
+ struct ggml_tensor * inpSA = inpL;
7943
8104
 
7944
8105
  cur = llm_build_norm(ctx0, inpL, hparams,
7945
- model.layers[il].attn_norm,
7946
- model.layers[il].attn_norm_b,
7947
- LLM_NORM, cb, il);
8106
+ model.layers[il].attn_norm, NULL,
8107
+ LLM_NORM_RMS, cb, il);
7948
8108
  cb(cur, "attn_norm", il);
7949
8109
 
7950
- // self attention
8110
+ // self-attention
7951
8111
  {
7952
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7953
- cb(cur, "wqkv", il);
8112
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8113
+ cb(Qcur, "Qcur", il);
7954
8114
 
7955
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7956
- cb(cur, "bqkv", il);
8115
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8116
+ cb(Kcur, "Kcur", il);
7957
8117
 
7958
- // split qkv
7959
- GGML_ASSERT(n_head_kv == n_head);
8118
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8119
+ cb(Vcur, "Vcur", il);
7960
8120
 
7961
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7962
- cb(tmpqkv, "tmpqkv", il);
8121
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8122
+ cb(Kcur, "Kcur", il);
7963
8123
 
7964
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7965
- cb(tmpqkv_perm, "tmpqkv", il);
8124
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8125
+ cb(Qcur, "Qcur", il);
7966
8126
 
7967
- struct ggml_tensor * tmpq = ggml_view_3d(
7968
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7969
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7970
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7971
- 0
7972
- );
7973
- cb(tmpq, "tmpq", il);
7974
-
7975
- struct ggml_tensor * tmpk = ggml_view_3d(
7976
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7977
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7978
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7979
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
7980
- );
7981
- cb(tmpk, "tmpk", il);
7982
-
7983
- // Q/K Layernorm
7984
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
7985
- model.layers[il].attn_q_norm,
7986
- model.layers[il].attn_q_norm_b,
7987
- LLM_NORM, cb, il);
7988
- cb(tmpq, "tmpq", il);
7989
-
7990
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
7991
- model.layers[il].attn_k_norm,
7992
- model.layers[il].attn_k_norm_b,
7993
- LLM_NORM, cb, il);
7994
- cb(tmpk, "tmpk", il);
7995
-
7996
- // RoPE the first n_rot of q/k, pass the other half, and concat.
7997
- struct ggml_tensor * qrot = ggml_view_3d(
7998
- ctx0, tmpq, n_rot, n_head, n_tokens,
7999
- ggml_element_size(tmpq) * n_embd_head,
8000
- ggml_element_size(tmpq) * n_embd_head * n_head,
8001
- 0
8002
- );
8003
- cb(qrot, "qrot", il);
8004
-
8005
- struct ggml_tensor * krot = ggml_view_3d(
8006
- ctx0, tmpk, n_rot, n_head, n_tokens,
8007
- ggml_element_size(tmpk) * n_embd_head,
8008
- ggml_element_size(tmpk) * n_embd_head * n_head,
8009
- 0
8010
- );
8011
- cb(krot, "krot", il);
8012
-
8013
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
8014
- struct ggml_tensor * qpass = ggml_view_3d(
8015
- ctx0, tmpq, n_rot, n_head, n_tokens,
8016
- ggml_element_size(tmpq) * n_embd_head,
8017
- ggml_element_size(tmpq) * n_embd_head * n_head,
8018
- ggml_element_size(tmpq) * n_rot
8019
- );
8020
- cb(qpass, "qpass", il);
8021
-
8022
- struct ggml_tensor * kpass = ggml_view_3d(
8023
- ctx0, tmpk, n_rot, n_head, n_tokens,
8024
- ggml_element_size(tmpk) * n_embd_head,
8025
- ggml_element_size(tmpk) * n_embd_head * n_head,
8026
- ggml_element_size(tmpk) * n_rot
8027
- );
8028
- cb(kpass, "kpass", il);
8029
-
8030
- struct ggml_tensor * qrotated = ggml_rope_custom(
8031
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8032
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8033
- );
8034
- cb(qrotated, "qrotated", il);
8035
-
8036
- struct ggml_tensor * krotated = ggml_rope_custom(
8037
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8038
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8039
- );
8040
- cb(krotated, "krotated", il);
8041
-
8042
- // ggml currently only supports concatenation on dim=2
8043
- // so we need to permute qrot, qpass, concat, then permute back.
8044
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
8045
- cb(qrotated, "qrotated", il);
8046
-
8047
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
8048
- cb(krotated, "krotated", il);
8049
-
8050
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
8051
- cb(qpass, "qpass", il);
8052
-
8053
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
8054
- cb(kpass, "kpass", il);
8055
-
8056
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
8057
- cb(Qcur, "Qcur", il);
8058
-
8059
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
8060
- cb(Kcur, "Kcur", il);
8061
-
8062
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
8063
- cb(Q, "Q", il);
8064
-
8065
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
8066
- cb(Kcur, "Kcur", il);
8067
-
8068
- struct ggml_tensor * Vcur = ggml_view_3d(
8069
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
8070
- ggml_element_size(tmpqkv_perm) * n_embd_head,
8071
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
8072
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
8073
- );
8074
- cb(Vcur, "Vcur", il);
8075
-
8076
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8077
- model.layers[il].wo, model.layers[il].bo,
8078
- Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8079
- }
8080
-
8081
- if (il == n_layer - 1) {
8082
- // skip computing output for unused tokens
8083
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8084
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8085
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8086
- }
8087
-
8088
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
8089
- cb(ffn_inp, "ffn_inp", il);
8090
-
8091
- // feed-forward network
8092
- {
8093
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
8094
- model.layers[il].ffn_norm,
8095
- model.layers[il].ffn_norm_b,
8096
- LLM_NORM, cb, il);
8097
- cb(cur, "ffn_norm", il);
8098
-
8099
- cur = llm_build_ffn(ctx0, cur,
8100
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8101
- NULL, NULL,
8102
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8103
- NULL,
8104
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
8105
- cb(cur, "ffn_out", il);
8106
- }
8107
-
8108
- cur = ggml_add(ctx0, cur, ffn_inp);
8109
- cb(cur, "l_out", il);
8110
-
8111
- inpL = cur;
8112
- }
8113
-
8114
- cur = inpL;
8115
-
8116
- cur = llm_build_norm(ctx0, cur, hparams,
8117
- model.output_norm,
8118
- model.output_norm_b,
8119
- LLM_NORM, cb, -1);
8120
- cb(cur, "result_norm", -1);
8121
-
8122
- cur = ggml_mul_mat(ctx0, model.output, cur);
8123
- cb(cur, "result_output", -1);
8124
-
8125
- ggml_build_forward_expand(gf, cur);
8126
-
8127
- return gf;
8128
- }
8129
-
8130
- struct ggml_cgraph * build_refact() {
8131
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8132
-
8133
- const int64_t n_embd_head = hparams.n_embd_head_v;
8134
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8135
-
8136
- struct ggml_tensor * cur;
8137
- struct ggml_tensor * inpL;
8138
-
8139
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8140
-
8141
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8142
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8143
-
8144
- for (int il = 0; il < n_layer; ++il) {
8145
- struct ggml_tensor * inpSA = inpL;
8146
-
8147
- cur = llm_build_norm(ctx0, inpL, hparams,
8148
- model.layers[il].attn_norm, NULL,
8149
- LLM_NORM_RMS, cb, il);
8150
- cb(cur, "attn_norm", il);
8151
-
8152
- // self-attention
8153
- {
8154
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8155
- cb(Qcur, "Qcur", il);
8156
-
8157
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8158
- cb(Kcur, "Kcur", il);
8159
-
8160
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8161
- cb(Vcur, "Vcur", il);
8162
-
8163
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8164
- cb(Kcur, "Kcur", il);
8165
-
8166
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8167
- cb(Qcur, "Qcur", il);
8168
-
8169
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8170
- model.layers[il].wo, NULL,
8171
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8172
- }
8127
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8128
+ model.layers[il].wo, NULL,
8129
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8130
+ }
8173
8131
 
8174
8132
  if (il == n_layer - 1) {
8175
8133
  // skip computing output for unused tokens
@@ -8303,15 +8261,15 @@ struct llm_build_context {
8303
8261
  cb(Kcur, "Kcur", il);
8304
8262
  cb(Vcur, "Vcur", il);
8305
8263
 
8306
- Qcur = ggml_rope_custom(
8307
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8264
+ Qcur = ggml_rope_ext(
8265
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8308
8266
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8309
8267
  ext_factor, attn_factor, beta_fast, beta_slow
8310
8268
  );
8311
8269
  cb(Qcur, "Qcur", il);
8312
8270
 
8313
- Kcur = ggml_rope_custom(
8314
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8271
+ Kcur = ggml_rope_ext(
8272
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8315
8273
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8316
8274
  ext_factor, attn_factor, beta_fast, beta_slow
8317
8275
  );
@@ -8743,15 +8701,15 @@ struct llm_build_context {
8743
8701
  }
8744
8702
 
8745
8703
 
8746
- Qcur = ggml_rope_custom(
8747
- ctx0, Qcur, inp_pos,
8704
+ Qcur = ggml_rope_ext(
8705
+ ctx0, Qcur, inp_pos, nullptr,
8748
8706
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8749
8707
  ext_factor, attn_factor, beta_fast, beta_slow
8750
8708
  );
8751
8709
  cb(Qcur, "Qcur", il);
8752
8710
 
8753
- Kcur = ggml_rope_custom(
8754
- ctx0, Kcur, inp_pos,
8711
+ Kcur = ggml_rope_ext(
8712
+ ctx0, Kcur, inp_pos, nullptr,
8755
8713
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8756
8714
  ext_factor, attn_factor, beta_fast, beta_slow
8757
8715
  );
@@ -8863,14 +8821,14 @@ struct llm_build_context {
8863
8821
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8864
8822
 
8865
8823
  // using mode = 2 for neox mode
8866
- Qcur = ggml_rope_custom(
8867
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8824
+ Qcur = ggml_rope_ext(
8825
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8868
8826
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8869
8827
  );
8870
8828
  cb(Qcur, "Qcur", il);
8871
8829
 
8872
- Kcur = ggml_rope_custom(
8873
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8830
+ Kcur = ggml_rope_ext(
8831
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8874
8832
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8875
8833
  );
8876
8834
  cb(Kcur, "Kcur", il);
@@ -8974,15 +8932,15 @@ struct llm_build_context {
8974
8932
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8975
8933
  cb(Vcur, "Vcur", il);
8976
8934
 
8977
- Qcur = ggml_rope_custom(
8978
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8935
+ Qcur = ggml_rope_ext(
8936
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8979
8937
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8980
8938
  ext_factor, attn_factor, beta_fast, beta_slow
8981
8939
  );
8982
8940
  cb(Qcur, "Qcur", il);
8983
8941
 
8984
- Kcur = ggml_rope_custom(
8985
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8942
+ Kcur = ggml_rope_ext(
8943
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8986
8944
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8987
8945
  ext_factor, attn_factor, beta_fast, beta_slow
8988
8946
  );
@@ -9088,15 +9046,15 @@ struct llm_build_context {
9088
9046
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9089
9047
  cb(Vcur, "Vcur", il);
9090
9048
 
9091
- Qcur = ggml_rope_custom(
9092
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9049
+ Qcur = ggml_rope_ext(
9050
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9093
9051
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9094
9052
  ext_factor, attn_factor, beta_fast, beta_slow
9095
9053
  );
9096
9054
  cb(Qcur, "Qcur", il);
9097
9055
 
9098
- Kcur = ggml_rope_custom(
9099
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9056
+ Kcur = ggml_rope_ext(
9057
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9100
9058
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9101
9059
  ext_factor, attn_factor, beta_fast, beta_slow
9102
9060
  );
@@ -9240,8 +9198,8 @@ struct llm_build_context {
9240
9198
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9241
9199
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9242
9200
 
9243
- Qcur = ggml_rope_custom(
9244
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9201
+ Qcur = ggml_rope_ext(
9202
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9245
9203
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9246
9204
  );
9247
9205
  cb(Qcur, "Qcur", il);
@@ -9251,8 +9209,8 @@ struct llm_build_context {
9251
9209
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9252
9210
  cb(Qcur, "Qcur", il);
9253
9211
 
9254
- Kcur = ggml_rope_custom(
9255
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9212
+ Kcur = ggml_rope_ext(
9213
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9256
9214
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9257
9215
  );
9258
9216
  cb(Kcur, "Kcur", il);
@@ -9328,6 +9286,9 @@ struct llm_build_context {
9328
9286
 
9329
9287
  // self-attention
9330
9288
  {
9289
+ // rope freq factors for 128k context
9290
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
9291
+
9331
9292
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9332
9293
  model.layers[il].attn_norm,
9333
9294
  NULL,
@@ -9359,8 +9320,8 @@ struct llm_build_context {
9359
9320
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9360
9321
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9361
9322
 
9362
- Qcur = ggml_rope_custom(
9363
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9323
+ Qcur = ggml_rope_ext(
9324
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9364
9325
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9365
9326
  );
9366
9327
  cb(Qcur, "Qcur", il);
@@ -9368,8 +9329,8 @@ struct llm_build_context {
9368
9329
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9369
9330
  cb(Qcur, "Qcur", il);
9370
9331
 
9371
- Kcur = ggml_rope_custom(
9372
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9332
+ Kcur = ggml_rope_ext(
9333
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9373
9334
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9374
9335
  );
9375
9336
  cb(Kcur, "Kcur", il);
@@ -9475,14 +9436,14 @@ struct llm_build_context {
9475
9436
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9476
9437
  cb(Vcur, "Vcur", il);
9477
9438
 
9478
- Qcur = ggml_rope_custom(
9479
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
9439
+ Qcur = ggml_rope_ext(
9440
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9480
9441
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9481
9442
  ext_factor, attn_factor, beta_fast, beta_slow);
9482
9443
  cb(Qcur, "Qcur", il);
9483
9444
 
9484
- Kcur = ggml_rope_custom(
9485
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
9445
+ Kcur = ggml_rope_ext(
9446
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9486
9447
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9487
9448
  ext_factor, attn_factor, beta_fast, beta_slow);
9488
9449
  cb(Kcur, "Kcur", il);
@@ -9683,15 +9644,15 @@ struct llm_build_context {
9683
9644
  cb(tmpk, "tmpk", il);
9684
9645
  cb(Vcur, "Vcur", il);
9685
9646
 
9686
- struct ggml_tensor * Qcur = ggml_rope_custom(
9687
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
9647
+ struct ggml_tensor * Qcur = ggml_rope_ext(
9648
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9688
9649
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9689
9650
  ext_factor, attn_factor, beta_fast, beta_slow
9690
9651
  );
9691
9652
  cb(Qcur, "Qcur", il);
9692
9653
 
9693
- struct ggml_tensor * Kcur = ggml_rope_custom(
9694
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
9654
+ struct ggml_tensor * Kcur = ggml_rope_ext(
9655
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9695
9656
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9696
9657
  ext_factor, attn_factor, beta_fast, beta_slow
9697
9658
  );
@@ -9799,15 +9760,15 @@ struct llm_build_context {
9799
9760
  // cb(Vcur, "Vcur", il);
9800
9761
  // }
9801
9762
 
9802
- Qcur = ggml_rope_custom(
9803
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9763
+ Qcur = ggml_rope_ext(
9764
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9804
9765
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9805
9766
  ext_factor, attn_factor, beta_fast, beta_slow
9806
9767
  );
9807
9768
  cb(Qcur, "Qcur", il);
9808
9769
 
9809
- Kcur = ggml_rope_custom(
9810
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9770
+ Kcur = ggml_rope_ext(
9771
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9811
9772
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9812
9773
  ext_factor, attn_factor, beta_fast, beta_slow
9813
9774
  );
@@ -9916,15 +9877,15 @@ struct llm_build_context {
9916
9877
  cb(Vcur, "Vcur", il);
9917
9878
  }
9918
9879
 
9919
- Qcur = ggml_rope_custom(
9920
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9880
+ Qcur = ggml_rope_ext(
9881
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9921
9882
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9922
9883
  ext_factor, attn_factor, beta_fast, beta_slow
9923
9884
  );
9924
9885
  cb(Qcur, "Qcur", il);
9925
9886
 
9926
- Kcur = ggml_rope_custom(
9927
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9887
+ Kcur = ggml_rope_ext(
9888
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9928
9889
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9929
9890
  ext_factor, attn_factor, beta_fast, beta_slow
9930
9891
  );
@@ -10046,15 +10007,15 @@ struct llm_build_context {
10046
10007
  cb(Vcur, "Vcur", il);
10047
10008
  }
10048
10009
 
10049
- Qcur = ggml_rope_custom(
10050
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10010
+ Qcur = ggml_rope_ext(
10011
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10051
10012
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10052
10013
  ext_factor, attn_factor, beta_fast, beta_slow
10053
10014
  );
10054
10015
  cb(Qcur, "Qcur", il);
10055
10016
 
10056
- Kcur = ggml_rope_custom(
10057
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10017
+ Kcur = ggml_rope_ext(
10018
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10058
10019
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10059
10020
  ext_factor, attn_factor, beta_fast, beta_slow
10060
10021
  );
@@ -10166,8 +10127,8 @@ struct llm_build_context {
10166
10127
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10167
10128
  cb(Vcur, "Vcur", il);
10168
10129
 
10169
- Qcur = ggml_rope_custom(
10170
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
10130
+ Qcur = ggml_rope_ext(
10131
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10171
10132
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10172
10133
  ext_factor, attn_factor, beta_fast, beta_slow);
10173
10134
  cb(Qcur, "Qcur", il);
@@ -10175,8 +10136,8 @@ struct llm_build_context {
10175
10136
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
10176
10137
  cb(Qcur, "Qcur_scaled", il);
10177
10138
 
10178
- Kcur = ggml_rope_custom(
10179
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
10139
+ Kcur = ggml_rope_ext(
10140
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10180
10141
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10181
10142
  ext_factor, attn_factor, beta_fast, beta_slow);
10182
10143
  cb(Kcur, "Kcur", il);
@@ -10286,15 +10247,15 @@ struct llm_build_context {
10286
10247
  cb(Vcur, "Vcur", il);
10287
10248
  }
10288
10249
 
10289
- Qcur = ggml_rope_custom(
10290
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10250
+ Qcur = ggml_rope_ext(
10251
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10291
10252
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10292
10253
  ext_factor, attn_factor, beta_fast, beta_slow
10293
10254
  );
10294
10255
  cb(Qcur, "Qcur", il);
10295
10256
 
10296
- Kcur = ggml_rope_custom(
10297
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10257
+ Kcur = ggml_rope_ext(
10258
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10298
10259
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10299
10260
  ext_factor, attn_factor, beta_fast, beta_slow
10300
10261
  );
@@ -10576,15 +10537,15 @@ struct llm_build_context {
10576
10537
  cb(Kcur, "Kcur", il);
10577
10538
  }
10578
10539
 
10579
- Qcur = ggml_rope_custom(
10580
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10540
+ Qcur = ggml_rope_ext(
10541
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10581
10542
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10582
10543
  ext_factor, attn_factor, beta_fast, beta_slow
10583
10544
  );
10584
10545
  cb(Qcur, "Qcur", il);
10585
10546
 
10586
- Kcur = ggml_rope_custom(
10587
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10547
+ Kcur = ggml_rope_ext(
10548
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10588
10549
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10589
10550
  ext_factor, attn_factor, beta_fast, beta_slow
10590
10551
  );
@@ -10707,15 +10668,15 @@ struct llm_build_context {
10707
10668
  cb(Vcur, "Vcur", il);
10708
10669
  }
10709
10670
 
10710
- Qcur = ggml_rope_custom(
10711
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10671
+ Qcur = ggml_rope_ext(
10672
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10712
10673
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10713
10674
  ext_factor, attn_factor, beta_fast, beta_slow
10714
10675
  );
10715
10676
  cb(Qcur, "Qcur", il);
10716
10677
 
10717
- Kcur = ggml_rope_custom(
10718
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10678
+ Kcur = ggml_rope_ext(
10679
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10719
10680
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10720
10681
  ext_factor, attn_factor, beta_fast, beta_slow
10721
10682
  );
@@ -10779,6 +10740,274 @@ struct llm_build_context {
10779
10740
 
10780
10741
  return gf;
10781
10742
  }
10743
+
10744
+ struct ggml_cgraph * build_gptneox() {
10745
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10746
+
10747
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10748
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
10749
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10750
+
10751
+ struct ggml_tensor * cur;
10752
+ struct ggml_tensor * inpL;
10753
+
10754
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10755
+
10756
+ // inp_pos - contains the positions
10757
+ struct ggml_tensor * inp_pos = build_inp_pos();
10758
+
10759
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10760
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10761
+
10762
+ for (int il = 0; il < n_layer; ++il) {
10763
+ cur = llm_build_norm(ctx0, inpL, hparams,
10764
+ model.layers[il].attn_norm,
10765
+ model.layers[il].attn_norm_b,
10766
+ LLM_NORM, cb, il);
10767
+ cb(cur, "attn_norm", il);
10768
+
10769
+ // self-attention
10770
+ {
10771
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10772
+ cb(cur, "wqkv", il);
10773
+
10774
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10775
+ cb(cur, "bqkv", il);
10776
+
10777
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10778
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
10779
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10780
+
10781
+ cb(Qcur, "Qcur", il);
10782
+ cb(Kcur, "Kcur", il);
10783
+ cb(Vcur, "Vcur", il);
10784
+
10785
+ Qcur = ggml_rope_ext(
10786
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10787
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10788
+ ext_factor, attn_factor, beta_fast, beta_slow
10789
+ );
10790
+ cb(Qcur, "Qcur", il);
10791
+
10792
+ Kcur = ggml_rope_ext(
10793
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10794
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10795
+ ext_factor, attn_factor, beta_fast, beta_slow
10796
+ );
10797
+ cb(Kcur, "Kcur", il);
10798
+
10799
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10800
+ model.layers[il].wo, model.layers[il].bo,
10801
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10802
+ }
10803
+
10804
+ if (il == n_layer - 1) {
10805
+ // skip computing output for unused tokens
10806
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10807
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10808
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10809
+ }
10810
+
10811
+ // ffn
10812
+ if (hparams.use_par_res) {
10813
+ // attention and ffn are computed in parallel
10814
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
10815
+
10816
+ struct ggml_tensor * attn_out = cur;
10817
+
10818
+ cur = llm_build_norm(ctx0, inpL, hparams,
10819
+ model.layers[il].ffn_norm,
10820
+ model.layers[il].ffn_norm_b,
10821
+ LLM_NORM, cb, il);
10822
+ cb(cur, "ffn_norm", il);
10823
+
10824
+ cur = llm_build_ffn(ctx0, cur,
10825
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10826
+ NULL, NULL,
10827
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10828
+ NULL,
10829
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10830
+ cb(cur, "ffn_out", il);
10831
+
10832
+ cur = ggml_add(ctx0, cur, inpL);
10833
+ cb(cur, "ffn_out", il);
10834
+
10835
+ inpL = ggml_add(ctx0, cur, attn_out);
10836
+ cb(inpL, "l_out", il);
10837
+ } else {
10838
+ // attention and ffn are computed sequentially
10839
+ // x = x + attn(ln1(x))
10840
+ // x = x + ffn(ln2(x))
10841
+
10842
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
10843
+ cb(ffn_inp, "ffn_inp", il);
10844
+
10845
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10846
+ model.layers[il].ffn_norm,
10847
+ model.layers[il].ffn_norm_b,
10848
+ LLM_NORM, cb, il);
10849
+ cb(cur, "ffn_norm", il);
10850
+
10851
+ cur = llm_build_ffn(ctx0, cur,
10852
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10853
+ NULL, NULL,
10854
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10855
+ NULL,
10856
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10857
+ cb(cur, "ffn_out", il);
10858
+
10859
+ inpL = ggml_add(ctx0, cur, ffn_inp);
10860
+ cb(inpL, "l_out", il);
10861
+ }
10862
+ }
10863
+
10864
+ cur = llm_build_norm(ctx0, inpL, hparams,
10865
+ model.output_norm,
10866
+ model.output_norm_b,
10867
+ LLM_NORM, cb, -1);
10868
+ cb(cur, "result_norm", -1);
10869
+
10870
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10871
+ cb(cur, "result_output", -1);
10872
+
10873
+ ggml_build_forward_expand(gf, cur);
10874
+
10875
+ return gf;
10876
+ }
10877
+
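build_gptneox() above mirrors the tensors registered earlier (fused QKV split into views, biased LayerNorms, NeoX-style RoPE) and branches on hparams.use_par_res for the residual layout. Written as update rules, annotated here for reference only:

    // parallel residual (use_par_res == true):
    //     x_out = x + Attn(LN_attn(x)) + FFN(LN_ffn(x))
    // sequential residual (use_par_res == false):
    //     h     = x + Attn(LN_attn(x));
    //     x_out = h + FFN(LN_ffn(h))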
10878
+ struct ggml_cgraph * build_arctic() {
10879
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10880
+
10881
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10882
+ int32_t n_tokens = this->n_tokens;
10883
+
10884
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10885
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10886
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10887
+
10888
+ struct ggml_tensor * cur;
10889
+ struct ggml_tensor * inpL;
10890
+
10891
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10892
+
10893
+ // inp_pos - contains the positions
10894
+ struct ggml_tensor * inp_pos = build_inp_pos();
10895
+
10896
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10897
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10898
+
10899
+ for (int il = 0; il < n_layer; ++il) {
10900
+ struct ggml_tensor * inpSA = inpL;
10901
+
10902
+ // norm
10903
+ cur = llm_build_norm(ctx0, inpL, hparams,
10904
+ model.layers[il].attn_norm, NULL,
10905
+ LLM_NORM_RMS, cb, il);
10906
+ cb(cur, "attn_norm", il);
10907
+
10908
+ // self-attention
10909
+ {
10910
+ // compute Q and K and RoPE them
10911
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10912
+ cb(Qcur, "Qcur", il);
10913
+
10914
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10915
+ cb(Kcur, "Kcur", il);
10916
+
10917
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10918
+ cb(Vcur, "Vcur", il);
10919
+
10920
+ Qcur = ggml_rope_ext(
10921
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10922
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10923
+ ext_factor, attn_factor, beta_fast, beta_slow
10924
+ );
10925
+ cb(Qcur, "Qcur", il);
10926
+
10927
+ Kcur = ggml_rope_ext(
10928
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10929
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10930
+ ext_factor, attn_factor, beta_fast, beta_slow
10931
+ );
10932
+ cb(Kcur, "Kcur", il);
10933
+
10934
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10935
+ model.layers[il].wo, NULL,
10936
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10937
+ }
10938
+
10939
+ if (il == n_layer - 1) {
10940
+ // skip computing output for unused tokens
10941
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10942
+ n_tokens = n_outputs;
10943
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10944
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10945
+ }
10946
+
10947
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10948
+ cb(ffn_inp, "ffn_inp", il);
10949
+
10950
+ // feed-forward network
10951
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10952
+ model.layers[il].ffn_norm, NULL,
10953
+ LLM_NORM_RMS, cb, il);
10954
+ cb(cur, "ffn_norm", il);
10955
+
10956
+ cur = llm_build_ffn(ctx0, cur,
10957
+ model.layers[il].ffn_up, NULL,
10958
+ model.layers[il].ffn_gate, NULL,
10959
+ model.layers[il].ffn_down, NULL,
10960
+ NULL,
10961
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10962
+ cb(cur, "ffn_out", il);
10963
+
10964
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
10965
+ cb(ffn_out, "ffn_out", il);
10966
+
10967
+ // MoE
10968
+ cur = llm_build_norm(ctx0, inpSA, hparams,
10969
+ model.layers[il].ffn_norm_exps, NULL,
10970
+ LLM_NORM_RMS, cb, il);
10971
+ cb(cur, "ffn_norm_exps", il);
10972
+
10973
+ cur = llm_build_moe_ffn(ctx0, cur,
10974
+ model.layers[il].ffn_gate_inp,
10975
+ model.layers[il].ffn_up_exps,
10976
+ model.layers[il].ffn_gate_exps,
10977
+ model.layers[il].ffn_down_exps,
10978
+ n_expert, n_expert_used,
10979
+ LLM_FFN_SILU, true,
10980
+ cb, il);
10981
+ cb(cur, "ffn_moe_out", il);
10982
+
10983
+ cur = ggml_add(ctx0, cur, ffn_out);
10984
+ cb(cur, "ffn_out", il);
10985
+
10986
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10987
+ if (layer_dir != nullptr) {
10988
+ cur = ggml_add(ctx0, cur, layer_dir);
10989
+ }
10990
+ cb(cur, "l_out", il);
10991
+
10992
+ // input for next layer
10993
+ inpL = cur;
10994
+ }
10995
+
10996
+ cur = inpL;
10997
+
10998
+ cur = llm_build_norm(ctx0, cur, hparams,
10999
+ model.output_norm, NULL,
11000
+ LLM_NORM_RMS, cb, -1);
11001
+ cb(cur, "result_norm", -1);
11002
+
11003
+ // lm_head
11004
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11005
+ cb(cur, "result_output", -1);
11006
+
11007
+ ggml_build_forward_expand(gf, cur);
11008
+
11009
+ return gf;
11010
+ }
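build_arctic() runs two feed-forward paths per layer and sums them: a dense SwiGLU FFN on the post-attention residual, and an MoE FFN (SiLU gating, n_expert_used of n_expert experts) on a separately RMS-normalized copy of the layer input. The dataflow of one layer, condensed from the added code above:

    //     ffn_inp   = inpSA + Attn(RMSNorm(inpSA))
    //     ffn_out   = ffn_inp + SwiGLU_FFN(RMSNorm(ffn_inp))       // dense branch
    //     moe_out   = MoE_FFN(RMSNorm_exps(inpSA))                 // expert branch
    //     layer_out = ffn_out + moe_out (+ optional control vector from lctx.cvec)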
10782
11011
  };
10783
11012
 
10784
11013
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -10895,10 +11124,6 @@ static struct ggml_cgraph * llama_build_graph(
10895
11124
  {
10896
11125
  result = llm.build_starcoder();
10897
11126
  } break;
10898
- case LLM_ARCH_PERSIMMON:
10899
- {
10900
- result = llm.build_persimmon();
10901
- } break;
10902
11127
  case LLM_ARCH_REFACT:
10903
11128
  {
10904
11129
  result = llm.build_refact();
@@ -10993,6 +11218,14 @@ static struct ggml_cgraph * llama_build_graph(
10993
11218
  {
10994
11219
  result = llm.build_olmo();
10995
11220
  } break;
11221
+ case LLM_ARCH_GPTNEOX:
11222
+ {
11223
+ result = llm.build_gptneox();
11224
+ } break;
11225
+ case LLM_ARCH_ARCTIC:
11226
+ {
11227
+ result = llm.build_arctic();
11228
+ } break;
10996
11229
  default:
10997
11230
  GGML_ASSERT(false);
10998
11231
  }
@@ -11338,11 +11571,6 @@ static void llama_graph_compute(
11338
11571
  llama_context & lctx,
11339
11572
  ggml_cgraph * gf,
11340
11573
  int n_threads) {
11341
- #ifdef GGML_USE_MPI
11342
- const int64_t n_layer = lctx.model.hparams.n_layer;
11343
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
11344
- #endif
11345
-
11346
11574
  #ifdef GGML_USE_METAL
11347
11575
  if (ggml_backend_is_metal(lctx.backend_metal)) {
11348
11576
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11357,10 +11585,6 @@ static void llama_graph_compute(
11357
11585
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
11358
11586
 
11359
11587
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
11360
-
11361
- #ifdef GGML_USE_MPI
11362
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
11363
- #endif
11364
11588
  }
11365
11589
 
11366
11590
  // decode a batch of tokens by evaluating the transformer
@@ -11398,12 +11622,6 @@ static int llama_decode_internal(
11398
11622
  }
11399
11623
  lctx.n_queued_tokens += n_tokens_all;
11400
11624
 
11401
- #ifdef GGML_USE_MPI
11402
- // TODO: needs fix after #3228
11403
- GGML_ASSERT(false && "not implemented");
11404
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
11405
- #endif
11406
-
11407
11625
  auto & kv_self = lctx.kv_self;
11408
11626
 
11409
11627
  const int64_t n_embd = hparams.n_embd;
@@ -12297,6 +12515,7 @@ struct llm_tokenizer_bpe {
12297
12515
  });
12298
12516
  break;
12299
12517
  case LLAMA_VOCAB_PRE_TYPE_DBRX:
12518
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12300
12519
  word_collection = unicode_regex_split(text, {
12301
12520
  // same as llama3
12302
12521
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12353,6 +12572,7 @@ struct llm_tokenizer_bpe {
12353
12572
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12354
12573
  });
12355
12574
  break;
12575
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
12356
12576
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12357
12577
  word_collection = unicode_regex_split(text, {
12358
12578
  // original regex from tokenizer.json
@@ -12575,16 +12795,16 @@ struct llm_tokenizer_wpm {
12575
12795
  // to lowercase, pad chinese characters, pad punctuation
12576
12796
  std::string new_str = "";
12577
12797
  for (uint32_t code : cpts_nfd) {
12578
- int type = unicode_cpt_type(code);
12579
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
12798
+ const codepoint_flags flags = unicode_cpt_flags(code);
12799
+ if (flags.is_accent_mark || flags.is_control) {
12580
12800
  continue;
12581
12801
  }
12582
12802
  code = unicode_tolower(code);
12583
- if (type == CODEPOINT_TYPE_SEPARATOR) {
12803
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12584
12804
  code = ' ';
12585
12805
  }
12586
12806
  std::string s = unicode_cpt_to_utf8(code);
12587
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
12807
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12588
12808
  new_str += " ";
12589
12809
  new_str += s;
12590
12810
  new_str += " ";
@@ -12787,9 +13007,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12787
13007
  // tokenizer.encode('', add_special_tokens=True) returns [1]
12788
13008
  // tokenizer.encode('', add_special_tokens=False) returns []
12789
13009
 
13010
+ static const bool rtrim = true; //TODO: as param
13011
+ bool is_prev_special = false;
13012
+ bool special_token_rtrim = false;
13013
+
12790
13014
  if (add_special && vocab.special_add_bos != 0) {
12791
13015
  GGML_ASSERT(vocab.special_bos_id != -1);
12792
13016
  output.push_back(vocab.special_bos_id);
13017
+ is_prev_special = true;
12793
13018
  }
12794
13019
 
12795
13020
  for (const auto & fragment : fragment_buffer) {
@@ -12801,9 +13026,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12801
13026
  // and passing 'add space prefix' as bool argument
12802
13027
  //
12803
13028
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12804
- if (&fragment == &fragment_buffer.front()) {
12805
- if (vocab.add_space_prefix) {
12806
- raw_text = " " + raw_text; // prefix with space if the first token is not special
13029
+
13030
+ if (special_token_rtrim) {
13031
+ size_t num_whitespaces = 0;
13032
+ while (isspace(raw_text[num_whitespaces])) {
13033
+ num_whitespaces++;
13034
+ }
13035
+ if (num_whitespaces == raw_text.size()) {
13036
+ continue; // skip if all whitespaces
13037
+ }
13038
+ raw_text = raw_text.substr(num_whitespaces);
13039
+ }
13040
+
13041
+ if (vocab.add_space_prefix) {
13042
+ if (!output.size() || is_prev_special) { // prefix with space if first token
13043
+ raw_text = " " + raw_text;
12807
13044
  }
12808
13045
  }
12809
13046
 
@@ -12815,9 +13052,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12815
13052
  tokenizer.tokenize(raw_text, output);
12816
13053
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
12817
13054
  output.push_back(fragment.token);
13055
+ is_prev_special = true;
13056
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
13057
+ special_token_rtrim = rtrim
13058
+ && fragment.token != vocab.special_bos_id
13059
+ && fragment.token != vocab.special_unk_id
13060
+ && fragment.token != vocab.special_eos_id;
12818
13061
  }
12819
13062
  }
12820
13063
 
13064
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13065
+ LLAMA_LOG_WARN(
13066
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13067
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13068
+ "Are you sure this is what you want?\n", __FUNCTION__);
13069
+ }
13070
+
12821
13071
  if (add_special && vocab.special_add_eos == 1) {
12822
13072
  GGML_ASSERT(vocab.special_eos_id != -1);
12823
13073
  output.push_back(vocab.special_eos_id);
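This hunk changes SPM tokenization around special tokens: whitespace immediately following a user-defined special token is now trimmed before tokenizing the next fragment (BOS/UNK/EOS are exempt, matching Phi-3 style prompts), the add_space_prefix rule now applies after any special token rather than only to the very first fragment, and a warning is logged when add_special would produce a prompt that begins with two BOS tokens. A small illustration, assuming <|user|> is registered as a user-defined special token:

    //   fragments:      special(<|user|>), text(" \nWrite a haiku")
    //   with rtrim:     the whitespace run after <|user|> is dropped, then the remaining
    //                   text gets the usual leading-space prefix and goes through SPM
    //   rough result:   [BOS] <|user|> ▁Write ▁a ▁haiku   (sketch only, not actual token ids)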
@@ -12844,6 +13094,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         }
     }

+    if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+        LLAMA_LOG_WARN(
+            "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+            "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+            "Are you sure this is what you want?\n", __FUNCTION__);
+    }
+
     if (add_special && vocab.special_add_eos == 1) {
         GGML_ASSERT(vocab.special_add_eos != -1);
         output.push_back(vocab.special_eos_id);
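Note: the duplicated-BOS warning added in both tokenizer branches fires when the model is configured to prepend BOS automatically and the prompt text already begins with a BOS token (for example a llama-2 style prompt that spells out "<s>" and is tokenized with special-token parsing enabled). A small hedged sketch of detecting that situation from caller code; the helper name is hypothetical:

    #include "llama.h"
    #include <vector>

    // Returns true when the tokenized prompt starts with two BOS tokens,
    // i.e. the situation the warning above points at.
    bool has_duplicate_bos(const llama_model * model, const std::vector<llama_token> & tokens) {
        const llama_token bos = llama_token_bos(model);
        return tokens.size() >= 2 && tokens[0] == bos && tokens[1] == bos;
    }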
@@ -13904,9 +14161,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();

@@ -13920,9 +14175,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;

-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }

@@ -14507,8 +14760,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15522,10 +15773,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }

 void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15535,9 +15782,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }

 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }

@@ -15680,6 +15924,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }

+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
     cparams.causal_attn = hparams.causal_attn;

     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15938,20 +16183,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }

-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }

@@ -15988,7 +16219,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         // these models do not use RoPE
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
@@ -16008,13 +16238,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
-        case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_STABLELM:
@@ -16025,6 +16255,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here
@@ -16184,6 +16415,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }

     // make tensors
+    cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
         struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16192,6 +16424,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }

     // allocate tensors / buffers and zero
+    cvec.ctxs.reserve(ctx_map.size());
+    cvec.bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -17015,13 +17249,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
         else {
             if (cell_range_begin != kv_self.size) {
-                cell_ranges.push_back({ cell_range_begin, i });
+                cell_ranges.emplace_back(cell_range_begin, i);
                 cell_range_begin = kv_self.size;
             }
         }
     }
     if (cell_range_begin != kv_self.size) {
-        cell_ranges.push_back({ cell_range_begin, kv_self.size });
+        cell_ranges.emplace_back(cell_range_begin, kv_self.size);
     }

     // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
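Note: the push_back to emplace_back change constructs the (begin, end) pair directly inside the vector instead of building a temporary and then moving it in. A standalone sketch of the difference, using a hypothetical ranges vector unrelated to the kv-cache code:

    #include <cstddef>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::pair<size_t, size_t>> ranges;
        ranges.push_back({ 0, 8 });   // builds a temporary pair, then moves it into the vector
        ranges.emplace_back(8, 16);   // forwards the arguments to the pair constructor in place
    }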
@@ -17400,6 +17634,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }

+uint32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
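Note: the two new getters expose the thread counts that were previously only settable via llama_set_n_threads or the context params. A minimal usage sketch; the model path and thread values are placeholders:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

        llama_context_params cparams = llama_context_default_params();
        cparams.n_threads       = 8;   // generation threads
        cparams.n_threads_batch = 16;  // prompt-processing threads
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        printf("gen threads: %u, batch threads: %u\n",
               llama_n_threads(ctx), llama_n_threads_batch(ctx));

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
    }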
@@ -17623,6 +17865,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
     );
 }

+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
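Note: a typical use of the new predicate is filtering control tokens (BOS, EOS, and similar) out of model output before rendering it. A minimal sketch; the helper name is hypothetical:

    #include "llama.h"
    #include <vector>

    // Drop control tokens from a token sequence before detokenizing it for display.
    std::vector<llama_token> strip_control_tokens(const llama_model * model,
                                                  const std::vector<llama_token> & tokens) {
        std::vector<llama_token> out;
        out.reserve(tokens.size());
        for (llama_token tok : tokens) {
            if (!llama_token_is_control(model, tok)) {
                out.push_back(tok);
            }
        }
        return out;
    }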
@@ -17834,6 +18080,15 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
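Note: the Phi 3 branch is moved ahead of the zephyr check so that templates containing both "<|user|>" and "<|end|>" match Phi 3 first, and the content is no longer passed through trim(). A hedged sketch of rendering a conversation with this branch via the public API; the messages are placeholders and the buffer size is arbitrary:

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<llama_chat_message> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        std::vector<char> buf(1024);
        // Passing "phi3" as the template name selects the branch added above; the model
        // pointer may be null when an explicit template string is supplied.
        const int32_t n = llama_chat_apply_template(nullptr, "phi3", chat.data(), chat.size(),
                                                    /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            // Expected shape of the result:
            // <|system|>\nYou are a helpful assistant.<|end|>\n<|user|>\nHello!<|end|>\n<|assistant|>\n
            printf("%.*s\n", n, buf.data());
        }
    }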
@@ -17966,15 +18221,6 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-        // Phi 3
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
     } else {
         // template not supported
         return -1;
@@ -18096,8 +18342,10 @@ const char * llama_print_system_info(void) {
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
     s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
     s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
     s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
@@ -18156,6 +18404,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
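Note: with this change, a callback installed through llama_log_set is also forwarded to the CUDA backend in CUDA builds (previously only Metal). A minimal sketch of installing a custom callback; the callback body is an arbitrary example:

    #include "llama.h"
    #include <cstdio>

    // Route all llama.cpp and backend log output to stderr.
    static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
    }

    int main() {
        llama_log_set(my_log_callback, nullptr);
        llama_backend_init();
        // ... load a model, create a context, run inference ...
        llama_backend_free();
    }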