@fugood/llama.node 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/llama.cpp
@@ -7,6 +7,10 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

+ #ifdef GGML_USE_RPC
+ # include "ggml-rpc.h"
+ #endif
+
  #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
@@ -22,16 +26,9 @@
  #ifdef GGML_USE_METAL
  # include "ggml-metal.h"
  #endif
- #ifdef GGML_USE_MPI
- # include "ggml-mpi.h"
- #endif
- #ifndef QK_K
- # ifdef GGML_QKK_64
- # define QK_K 64
- # else
- # define QK_K 256
- # endif
- #endif
+
+ // TODO: replace with ggml API call
+ #define QK_K 256

  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -106,7 +103,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 60
+ #define LLAMA_MAX_EXPERTS 128

  //
  // logging
@@ -201,10 +198,10 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -224,43 +221,45 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_ARCTIC,
  LLM_ARCH_UNKNOWN,
  };

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GROK, "grok" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PHI3, "phi3" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
- { LLM_ARCH_GEMMA, "gemma" },
- { LLM_ARCH_STARCODER2, "starcoder2" },
- { LLM_ARCH_MAMBA, "mamba" },
- { LLM_ARCH_XVERSE, "xverse" },
- { LLM_ARCH_COMMAND_R, "command-r" },
- { LLM_ARCH_DBRX, "dbrx" },
- { LLM_ARCH_OLMO, "olmo" },
- { LLM_ARCH_UNKNOWN, "(unknown)" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

  enum llm_kv {
@@ -303,6 +302,7 @@ enum llm_kv {
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
  LLM_KV_ROPE_SCALING_FACTOR,
+ LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,

@@ -380,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
  { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

@@ -435,6 +436,8 @@ enum llm_tensor {
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
  LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ROPE_FACTORS_LONG,
+ LLM_TENSOR_ROPE_FACTORS_SHORT,
  LLM_TENSOR_ATTN_Q,
  LLM_TENSOR_ATTN_K,
  LLM_TENSOR_ATTN_V,
@@ -454,6 +457,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_NORM_EXPS,
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
@@ -592,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
- {
- LLM_ARCH_PERSIMMON,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
- { LLM_TENSOR_OUTPUT, "output"},
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
- },
- },
  {
  LLM_ARCH_MPT,
  {
@@ -691,6 +678,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_JINA_BERT_V2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_BLOOM,
  {
@@ -800,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  {
  LLM_ARCH_PHI3,
  {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -1027,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_ARCTIC,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1664,91 +1694,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
  GGML_UNUSED(host_buffer);
  }

- static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_METAL
- buft = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUDA)
- buft = ggml_backend_cuda_buffer_type(gpu);
- #elif defined(GGML_USE_VULKAN)
- buft = ggml_backend_vk_buffer_type(gpu);
- #elif defined(GGML_USE_SYCL)
- buft = ggml_backend_sycl_buffer_type(gpu);
- #elif defined(GGML_USE_CLBLAST)
- buft = ggml_backend_opencl_buffer_type();
- #elif defined(GGML_USE_KOMPUTE)
- buft = ggml_backend_kompute_buffer_type(gpu);
- if (buft == nullptr) {
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_cpu(true);
- }
- return buft;
-
- GGML_UNUSED(gpu);
- }
-
- static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_CUDA
- if (ggml_backend_cuda_get_device_count() > 1) {
- buft = ggml_backend_cuda_split_buffer_type(tensor_split);
- }
- #endif
-
- #ifdef GGML_USE_SYCL
- if (ggml_backend_sycl_get_device_count() > 1) {
- buft = ggml_backend_sycl_split_buffer_type(tensor_split);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_offload(fallback_gpu);
- }
- return buft;
-
- GGML_UNUSED(tensor_split);
- }
-
- static size_t llama_get_device_count() {
- #if defined(GGML_USE_CUDA)
- return ggml_backend_cuda_get_device_count();
- #elif defined(GGML_USE_SYCL)
- return ggml_backend_sycl_get_device_count();
- #elif defined(GGML_USE_VULKAN)
- return ggml_backend_vk_get_device_count();
- #else
- return 1;
- #endif
- }
-
- static size_t llama_get_device_memory(int device) {
- #if defined(GGML_USE_CUDA)
- size_t total;
- size_t free;
- ggml_backend_cuda_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_SYCL)
- size_t total;
- size_t free;
- ggml_backend_sycl_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_VULKAN)
- size_t total;
- size_t free;
- ggml_backend_vk_get_device_memory(device, &free, &total);
- return free;
- #else
- return 1;
- GGML_UNUSED(device);
- #endif
- }
-
  //
  // globals
  //
@@ -1757,6 +1702,8 @@ struct llama_state {
  llama_state() {
  #ifdef GGML_USE_METAL
  ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+ ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
  #endif
  }

@@ -1770,17 +1717,24 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_14M,
  MODEL_17M,
  MODEL_22M,
  MODEL_33M,
+ MODEL_70M,
  MODEL_109M,
  MODEL_137M,
+ MODEL_160M,
  MODEL_335M,
+ MODEL_410M,
  MODEL_0_5B,
  MODEL_1B,
+ MODEL_1_4B,
  MODEL_2B,
+ MODEL_2_8B,
  MODEL_3B,
  MODEL_4B,
+ MODEL_6_9B,
  MODEL_7B,
  MODEL_8B,
  MODEL_12B,
@@ -1803,6 +1757,7 @@ enum e_model {
  MODEL_8x7B,
  MODEL_8x22B,
  MODEL_16x12B,
+ MODEL_10B_128x3_66B,
  };

  static const size_t kiB = 1024;
@@ -1812,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
  struct llama_hparams {
  bool vocab_only;
  bool rope_finetuned;
+ bool use_par_res;

  uint32_t n_vocab;
  uint32_t n_ctx_train; // context size the model was trained on
@@ -1830,6 +1786,7 @@ struct llama_hparams {
  float f_norm_eps;
  float f_norm_rms_eps;

+ float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
@@ -1845,7 +1802,7 @@ struct llama_hparams {
  float f_logit_scale = 0.0f;

  bool causal_attn = true;
- bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
+ bool use_alibi = false;

  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1878,6 +1835,7 @@ struct llama_hparams {

  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+ if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

@@ -1975,6 +1933,7 @@ struct llama_layer {
  struct ggml_tensor * ffn_norm_b;
  struct ggml_tensor * layer_out_norm;
  struct ggml_tensor * layer_out_norm_b;
+ struct ggml_tensor * ffn_norm_exps;

  // ff
  struct ggml_tensor * ffn_gate; // w1
@@ -2012,6 +1971,10 @@ struct llama_layer {
  // mamba bias
  struct ggml_tensor * ssm_conv1d_b;
  struct ggml_tensor * ssm_dt_b;
+
+ // long rope factors
+ struct ggml_tensor * rope_long = nullptr;
+ struct ggml_tensor * rope_short = nullptr;
  };

  struct llama_kv_cell {
@@ -2189,6 +2152,8 @@ struct llama_model {
  int main_gpu;
  int n_gpu_layers;

+ std::vector<std::string> rpc_servers;
+
  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;

@@ -2317,7 +2282,6 @@ struct llama_context {
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2327,11 +2291,105 @@ struct llama_context {

  // control vectors
  struct llama_control_vector cvec;
+ };
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_RPC
+ std::string endpoint = model.rpc_servers[gpu];
+ buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+ #elif defined(GGML_USE_METAL)
+ buft = ggml_backend_metal_buffer_type();
+ #elif defined(GGML_USE_CUDA)
+ buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_VULKAN)
+ buft = ggml_backend_vk_buffer_type(gpu);
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_buffer_type(gpu);
+ #elif defined(GGML_USE_CLBLAST)
+ buft = ggml_backend_opencl_buffer_type();
+ #elif defined(GGML_USE_KOMPUTE)
+ buft = ggml_backend_kompute_buffer_type(gpu);
+ if (buft == nullptr) {
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_cpu(true);
+ }
+ return buft;
+ GGML_UNUSED(model);
+ GGML_UNUSED(gpu);
+ }

- #ifdef GGML_USE_MPI
- ggml_mpi_context * ctx_mpi = NULL;
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_CUDA
+ if (ggml_backend_cuda_get_device_count() > 1) {
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+ }
  #endif
- };
+
+ #ifdef GGML_USE_SYCL
+ if (ggml_backend_sycl_get_device_count() > 1) {
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_offload(model, fallback_gpu);
+ }
+ return buft;
+
+ GGML_UNUSED(tensor_split);
+ }
+
+ static size_t llama_get_device_count(const llama_model & model) {
+ #if defined(GGML_USE_RPC)
+ return model.rpc_servers.size();
+ #elif defined(GGML_USE_CUDA)
+ return ggml_backend_cuda_get_device_count();
+ #elif defined(GGML_USE_SYCL)
+ return ggml_backend_sycl_get_device_count();
+ #elif defined(GGML_USE_VULKAN)
+ return ggml_backend_vk_get_device_count();
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ }
+
+ static size_t llama_get_device_memory(const llama_model & model, int device) {
+ #if defined(GGML_USE_RPC)
+ size_t total;
+ size_t free;
+ std::string endpoint = model.rpc_servers[device];
+ ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+ return free;
+ #elif defined(GGML_USE_CUDA)
+ size_t total;
+ size_t free;
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_SYCL)
+ size_t total;
+ size_t free;
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_VULKAN)
+ size_t total;
+ size_t free;
+ ggml_backend_vk_get_device_memory(device, &free, &total);
+ return free;
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ GGML_UNUSED(device);
+ }

  //
  // kv cache helpers
@@ -2452,7 +2510,6 @@ static bool llama_kv_cache_init(
  static bool llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_batch & batch) {
- const uint32_t n_ctx = cache.size;
  const uint32_t n_tokens = batch.n_tokens;

  if (cache.recurrent) {
@@ -2503,16 +2560,16 @@
  }
  // otherwise, one cell per token.

- if (n_tokens > n_ctx) {
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ if (n_tokens > cache.size) {
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
  return false;
  }

  uint32_t n_tested = 0;

  while (true) {
- if (cache.head + n_tokens > n_ctx) {
- n_tested += n_ctx - cache.head;
+ if (cache.head + n_tokens > cache.size) {
+ n_tested += cache.size - cache.head;
  cache.head = 0;
  continue;
  }
@@ -2531,7 +2588,7 @@
  break;
  }

- if (n_tested >= n_ctx) {
+ if (n_tested >= cache.size) {
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
  return false;
  }
@@ -2785,6 +2842,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
  cache.do_defrag = true;
  }

+ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+ // the FA kernels require padding to avoid extra runtime boundary checks
+ return cparams.flash_attn ? 256u : 32u;
+ }
+
  //
  // model loading and saving
  //
@@ -3287,22 +3349,55 @@ struct llama_model_loader {
  }

  template<typename T>
- bool get_key(const std::string & key, T & result, const bool required = true) {
- auto it = kv_overrides.find(key);
+ bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+ const int kid = gguf_find_key(meta, key.c_str());

- const struct llama_model_kv_override * override =
- it != kv_overrides.end() ? &it->second : nullptr;
+ if (kid < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }

- const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+ struct GGUFMeta::ArrayInfo arr_info =
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);

- if (required && !found) {
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+ throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
  }

- return found;
- }
+ // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+ GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));

- template<typename T>
+ result.resize(arr_info.length);
+ result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+ return true;
+ }
+
+ template<typename T>
+ bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+ return get_arr(llm_kv(kid), result, required);
+ }
+
+ template<typename T>
+ bool get_key(const std::string & key, T & result, const bool required = true) {
+ auto it = kv_overrides.find(key);
+
+ const struct llama_model_kv_override * override =
+ it != kv_overrides.end() ? &it->second : nullptr;
+
+ const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+
+ if (required && !found) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+
+ return found;
+ }
+
+ template<typename T>
  bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
  return get_key(llm_kv(kid), result, required);
  }
@@ -3360,11 +3455,15 @@ struct llama_model_loader {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
  ggml_set_name(tensor, ggml_get_name(cur));

- n_created++;
+ if (duplicated) {
+ size_data += ggml_nbytes(cur);
+ } else {
+ n_created++;
+ }

  return tensor;
  }
@@ -3399,14 +3498,17 @@ struct llama_model_loader {
  return cur;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+ static const int TENSOR_NOT_REQUIRED = 1;
+ static const int TENSOR_DUPLICATED = 2;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

  if (cur == NULL) {
  return NULL;
  }

- return create_tensor_for(ctx, cur);
+ return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
  }

  struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3706,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

  static const char * llama_model_type_name(e_model type) {
  switch (type) {
- case MODEL_22M: return "22M";
- case MODEL_33M: return "33M";
- case MODEL_109M: return "109M";
- case MODEL_137M: return "137M";
- case MODEL_0_5B: return "0.5B";
- case MODEL_1B: return "1B";
- case MODEL_2B: return "2B";
- case MODEL_3B: return "3B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_12B: return "12B";
- case MODEL_13B: return "13B";
- case MODEL_14B: return "14B";
- case MODEL_15B: return "15B";
- case MODEL_20B: return "20B";
- case MODEL_30B: return "30B";
- case MODEL_34B: return "34B";
- case MODEL_35B: return "35B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- case MODEL_314B: return "314B";
- case MODEL_SMALL: return "0.1B";
- case MODEL_MEDIUM: return "0.4B";
- case MODEL_LARGE: return "0.8B";
- case MODEL_XL: return "1.5B";
- case MODEL_A2_7B: return "A2.7B";
- case MODEL_8x7B: return "8x7B";
- case MODEL_8x22B: return "8x22B";
- case MODEL_16x12B: return "16x12B";
- default: return "?B";
+ case MODEL_14M: return "14M";
+ case MODEL_17M: return "17M";
+ case MODEL_22M: return "22M";
+ case MODEL_33M: return "33M";
+ case MODEL_70M: return "70M";
+ case MODEL_109M: return "109M";
+ case MODEL_137M: return "137M";
+ case MODEL_160M: return "160M";
+ case MODEL_335M: return "335M";
+ case MODEL_410M: return "410M";
+ case MODEL_0_5B: return "0.5B";
+ case MODEL_1B: return "1B";
+ case MODEL_1_4B: return "1.4B";
+ case MODEL_2B: return "2B";
+ case MODEL_2_8B: return "2.8B";
+ case MODEL_3B: return "3B";
+ case MODEL_4B: return "4B";
+ case MODEL_6_9B: return "6.9B";
+ case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
+ case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
+ case MODEL_15B: return "15B";
+ case MODEL_20B: return "20B";
+ case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
+ case MODEL_35B: return "35B";
+ case MODEL_40B: return "40B";
+ case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
+ case MODEL_314B: return "314B";
+ case MODEL_SMALL: return "0.1B";
+ case MODEL_MEDIUM: return "0.4B";
+ case MODEL_LARGE: return "0.8B";
+ case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
+ case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+ default: return "?B";
  }
  }

@@ -3779,6 +3892,12 @@ static void llm_load_hparams(

  // get hparams kv
  ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+ // everything past this point is not vocab-related
+ if (hparams.vocab_only) {
+ return;
+ }
+
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3823,6 +3942,8 @@
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
  // sanity check for n_rot (optional)
  {
  hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3860,7 +3981,7 @@
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+ case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3922,14 +4043,6 @@
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
- case LLM_ARCH_PERSIMMON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 36: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
  case LLM_ARCH_REFACT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3962,6 +4075,19 @@
  model.type = e_model::MODEL_335M; break; // bge-large
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ hparams.f_max_alibi_bias = 8.0f;
+
+ switch (hparams.n_layer) {
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+ case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+ }
+ } break;
  case LLM_ARCH_NOMIC_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4058,6 +4184,7 @@
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_14B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4198,6 +4325,65 @@
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ switch (hparams.n_layer) {
+ case 6:
+ switch (hparams.n_ff) {
+ case 512: model.type = e_model::MODEL_14M; break;
+ case 2048: model.type = e_model::MODEL_70M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 12:
+ switch (hparams.n_ff) {
+ case 3072: model.type = e_model::MODEL_160M; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 16:
+ switch (hparams.n_ff) {
+ case 8192: model.type = e_model::MODEL_1B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 24:
+ switch (hparams.n_ff) {
+ case 4096: model.type = e_model::MODEL_410M; break;
+ case 8192: model.type = e_model::MODEL_1_4B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 32:
+ switch (hparams.n_ff) {
+ case 10240: model.type = e_model::MODEL_2_8B; break;
+ case 16384: model.type = e_model::MODEL_6_9B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 36:
+ switch (hparams.n_ff) {
+ case 20480: model.type = e_model::MODEL_12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 44:
+ switch (hparams.n_ff) {
+ case 24576: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ if (hparams.n_expert == 128) {
+ switch (hparams.n_layer) {
+ case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4383,7 +4569,11 @@ static void llm_load_vocab(
  tokenizer_pre == "starcoder") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
  } else if (
- tokenizer_pre == "gpt-2") {
+ tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "jina-es" ||
+ tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "jina-v2-es" ||
+ tokenizer_pre == "jina-v2-de") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "refact") {
@@ -4394,12 +4584,18 @@
  } else if (
  tokenizer_pre == "qwen2") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "stablelm2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
  } else if (
  tokenizer_pre == "olmo") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
  } else if (
  tokenizer_pre == "dbrx") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -4515,7 +4711,8 @@
  (t.first == "<|eot_id|>" ||
  t.first == "<|im_end|>" ||
  t.first == "<|end|>" ||
- t.first == "<end_of_turn>"
+ t.first == "<end_of_turn>" ||
+ t.first == "<|endoftext|>"
  )
  ) {
  vocab.special_eot_id = t.second;
@@ -4743,13 +4940,13 @@ static bool llm_load_tensors(

  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
  // calculate the split points
- int device_count = llama_get_device_count();
+ int device_count = llama_get_device_count(model);
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
  std::vector<float> splits(device_count);
  if (all_zero) {
  // default split, by free memory
  for (int i = 0; i < device_count; ++i) {
- splits[i] = llama_get_device_memory(i);
+ splits[i] = llama_get_device_memory(model, i);
  }
  } else {
  std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4966,35 @@
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
- model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+ model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
  }
  // assign the output layer
  if (n_gpu_layers > n_layer) {
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
- model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+ model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
  } else {
  model.buft_output = llama_default_buffer_type_cpu(true);
  }
  } else {
  ggml_backend_buffer_type_t split_buft;
  if (split_mode == LLAMA_SPLIT_MODE_ROW) {
- split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+ split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
  } else {
  // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
- split_buft = llama_default_buffer_type_offload(main_gpu);
+ split_buft = llama_default_buffer_type_offload(model, main_gpu);
  }
  // assign the repeating layers
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
  model.buft_layer[i] = {
  split_buft,
- llama_default_buffer_type_offload(main_gpu)
+ llama_default_buffer_type_offload(model, main_gpu)
  };
  }
  // assign the output layer
  if (n_gpu_layers > n_layer) {
  model.buft_output = {
  split_buft,
- llama_default_buffer_type_offload(main_gpu)
+ llama_default_buffer_type_offload(model, main_gpu)
  };
  } else {
  model.buft_output = llama_default_buffer_type_cpu(true);
@@ -4841,6 +5038,7 @@
  // create tensors for the weights
  {
  const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_head = n_embd / hparams.n_head;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4875,12 +5073,10 @@
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  if (model.arch != LLM_ARCH_MINICPM){
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }
  }
@@ -4899,10 +5095,10 @@
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

  // optional bias tensors
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

@@ -4913,7 +5109,7 @@
  } else {
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -4955,12 +5151,10 @@
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  }
  }

@@ -4983,7 +5177,7 @@

  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

- layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  if (layer.ffn_gate_exps) {
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5085,11 +5279,9 @@ static bool llm_load_tensors(
5085
5279
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5086
5280
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5087
5281
 
5088
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5282
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5089
5283
  if (!model.output) {
5090
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
5091
- ml.n_created--; // artificial tensor
5092
- ml.size_data += ggml_nbytes(model.output);
5284
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
5093
5285
  }
5094
5286
  }
5095
5287
 
@@ -5102,8 +5294,8 @@ static bool llm_load_tensors(
5102
5294
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5103
5295
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5104
5296
 
5105
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
5106
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
5297
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5298
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5107
5299
 
5108
5300
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5109
5301
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5121,7 +5313,12 @@ static bool llm_load_tensors(
5121
5313
  {
5122
5314
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5123
5315
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5124
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5316
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5317
+ if (!model.output) {
5318
+ // needs to be on GPU
5319
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5320
+ }
5321
+
5125
5322
  }
5126
5323
 
5127
5324
  for (int i = 0; i < n_layer; ++i) {
@@ -5149,47 +5346,6 @@ static bool llm_load_tensors(
5149
5346
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5150
5347
  }
5151
5348
  } break;
5152
- case LLM_ARCH_PERSIMMON:
5153
- {
5154
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5155
-
5156
- {
5157
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5158
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5159
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5160
- }
5161
-
5162
- for (int i = 0; i < n_layer; ++i) {
5163
- ggml_context * ctx_layer = ctx_for_layer(i);
5164
- ggml_context * ctx_split = ctx_for_layer_split(i);
5165
-
5166
- auto & layer = model.layers[i];
5167
-
5168
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5169
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5170
-
5171
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5172
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
5173
-
5174
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5175
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
5176
-
5177
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5178
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5179
-
5180
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5181
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5182
-
5183
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5184
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
5185
-
5186
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
5187
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
5188
-
5189
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
5190
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
5191
- }
5192
- } break;
5193
5349
  case LLM_ARCH_BERT:
5194
5350
  case LLM_ARCH_NOMIC_BERT:
5195
5351
  {
@@ -5242,6 +5398,50 @@ static bool llm_load_tensors(
5242
5398
  layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5243
5399
  }
5244
5400
  } break;
5401
+ case LLM_ARCH_JINA_BERT_V2:
5402
+ {
5403
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
5404
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
5405
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
5406
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
5407
+
5408
+ for (int i = 0; i < n_layer; ++i) {
5409
+ ggml_context * ctx_layer = ctx_for_layer(i);
5410
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5411
+
5412
+ auto & layer = model.layers[i]; // JinaBertLayer
5413
+
5414
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5415
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5416
+
5417
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5418
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5419
+
5420
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5421
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5422
+
5423
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5424
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5425
+
5426
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5427
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5428
+
5429
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
5430
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
5431
+
5432
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
5433
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
5434
+
5435
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5436
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5437
+
5438
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5439
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5440
+
5441
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
5442
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5443
+ }
5444
+ } break;
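The new LLM_ARCH_JINA_BERT_V2 case above loads word and token-type embeddings with a final embedding LayerNorm, optional per-layer Q/K norms (TENSOR_NOT_REQUIRED), and a gated FFN (ffn_up plus ffn_gate, with a bias only on ffn_down); no positional embeddings are loaded, and build_bert later skips inp_pos for this architecture. Further down, the same diff wires these tensors into a parallel gated GELU block (a GEGLU-style FFN: the activation is applied to one branch and multiplied with the other before ffn_down). The call, as it appears later in this diff:

    cur = llm_build_ffn(ctx0, cur,
            model.layers[il].ffn_up,   NULL,
            model.layers[il].ffn_gate, NULL,
            model.layers[il].ffn_down, model.layers[il].ffn_down_b,
            NULL,
            LLM_FFN_GELU, LLM_FFN_PAR, cb, il);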
5245
5445
  case LLM_ARCH_BLOOM:
5246
5446
  {
5247
5447
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5283,18 +5483,16 @@ static bool llm_load_tensors(
5283
5483
  case LLM_ARCH_MPT:
5284
5484
  {
5285
5485
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5286
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
5486
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
5287
5487
 
5288
5488
  // output
5289
5489
  {
5290
5490
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5291
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
5491
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5292
5492
 
5293
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5493
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5294
5494
  if (!model.output) {
5295
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
5296
- ml.n_created--; // artificial tensor
5297
- ml.size_data += ggml_nbytes(model.output);
5495
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
5298
5496
  }
5299
5497
  }
5300
5498
 
@@ -5305,31 +5503,31 @@ static bool llm_load_tensors(
5305
5503
  auto & layer = model.layers[i];
5306
5504
 
5307
5505
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5308
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
5506
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5309
5507
 
5310
5508
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
5311
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5509
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5312
5510
 
5313
5511
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5314
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
5512
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5315
5513
 
5316
5514
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5317
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5515
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5318
5516
 
5319
5517
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5320
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
5518
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5321
5519
 
5322
5520
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5323
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
5521
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5324
5522
 
5325
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5326
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5523
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5524
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5327
5525
 
5328
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5329
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5526
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5527
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5330
5528
 
5331
5529
  // AWQ ScaleActivation layer
5332
- layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
5530
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5333
5531
  }
5334
5532
  } break;
5335
5533
  case LLM_ARCH_STABLELM:
@@ -5358,17 +5556,17 @@ static bool llm_load_tensors(
5358
5556
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5359
5557
 
5360
5558
  // optional bias tensors, present in Stable LM 2 1.6B
5361
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
5362
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
5363
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
5559
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5560
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5561
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5364
5562
 
5365
5563
  // optional q and k layernorms, present in StableLM 2 12B
5366
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
5367
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
5564
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
5565
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
5368
5566
 
5369
5567
  // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
5370
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
5371
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
5568
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5569
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5372
5570
 
5373
5571
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5374
5572
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5411,12 +5609,10 @@ static bool llm_load_tensors(
5411
5609
  // output
5412
5610
  {
5413
5611
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5414
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5612
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5415
5613
  // if output is NULL, init from the input tok embed
5416
5614
  if (model.output == NULL) {
5417
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5418
- ml.n_created--; // artificial tensor
5419
- ml.size_data += ggml_nbytes(model.output);
5615
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5420
5616
  }
5421
5617
  }
5422
5618
 
@@ -5514,8 +5710,8 @@ static bool llm_load_tensors(
5514
5710
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5515
5711
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
5516
5712
 
5517
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
5518
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
5713
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5714
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
5519
5715
 
5520
5716
  if (layer.wqkv == nullptr) {
5521
5717
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5552,17 +5748,20 @@ static bool llm_load_tensors(
5552
5748
  ggml_context* ctx_layer = ctx_for_layer(i);
5553
5749
  ggml_context* ctx_split = ctx_for_layer_split(i);
5554
5750
 
5555
- auto& layer = model.layers[i];
5751
+ auto & layer = model.layers[i];
5556
5752
 
5557
5753
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
5558
5754
 
5559
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
5560
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5755
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
5756
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5561
5757
 
5562
5758
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
5563
5759
 
5564
5760
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
5565
5761
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
5762
+
5763
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5764
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
5566
5765
  }
5567
5766
  } break;
5568
5767
  case LLM_ARCH_PLAMO:
@@ -5731,9 +5930,7 @@ static bool llm_load_tensors(
5731
5930
 
5732
5931
  // output
5733
5932
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5734
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
5735
- ml.n_created--; // artificial tensor
5736
- ml.size_data += ggml_nbytes(model.output);
5933
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
5737
5934
 
5738
5935
  const int64_t n_ff = hparams.n_ff;
5739
5936
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5768,12 +5965,10 @@ static bool llm_load_tensors(
5768
5965
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5769
5966
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
5770
5967
 
5771
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5968
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5772
5969
  // if output is NULL, init from the input tok embed
5773
5970
  if (model.output == NULL) {
5774
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5775
- ml.n_created--; // artificial tensor
5776
- ml.size_data += ggml_nbytes(model.output);
5971
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5777
5972
  }
5778
5973
 
5779
5974
  }
@@ -5824,12 +6019,10 @@ static bool llm_load_tensors(
5824
6019
  {
5825
6020
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5826
6021
 
5827
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6022
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5828
6023
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
5829
6024
  if (model.output == NULL) {
5830
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5831
- ml.n_created--; // artificial tensor
5832
- ml.size_data += ggml_nbytes(model.output);
6025
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5833
6026
  }
5834
6027
  }
5835
6028
 
@@ -5890,9 +6083,7 @@ static bool llm_load_tensors(
5890
6083
  {
5891
6084
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5892
6085
  // init output from the input tok embed
5893
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5894
- ml.n_created--; // artificial tensor
5895
- ml.size_data += ggml_nbytes(model.output);
6086
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5896
6087
  }
5897
6088
 
5898
6089
  for (int i = 0; i < n_layer; ++i) {
@@ -5924,12 +6115,10 @@ static bool llm_load_tensors(
5924
6115
 
5925
6116
  // output
5926
6117
  {
5927
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
6118
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5928
6119
  // if output is NULL, init from the input tok embed
5929
6120
  if (model.output == NULL) {
5930
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5931
- ml.n_created--; // artificial tensor
5932
- ml.size_data += ggml_nbytes(model.output);
6121
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5933
6122
  }
5934
6123
  }
5935
6124
 
@@ -5949,6 +6138,81 @@ static bool llm_load_tensors(
5949
6138
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5950
6139
  }
5951
6140
  } break;
6141
+ case LLM_ARCH_GPTNEOX:
6142
+ {
6143
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6144
+ // output
6145
+ {
6146
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6147
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
6148
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6149
+ }
6150
+
6151
+ for (int i = 0; i < n_layer; ++i) {
6152
+ ggml_context * ctx_layer = ctx_for_layer(i);
6153
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6154
+
6155
+ auto & layer = model.layers[i];
6156
+
6157
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6158
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
6159
+
6160
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
6161
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
6162
+
6163
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6164
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
6165
+
6166
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6167
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
6168
+
6169
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
6170
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
6171
+
6172
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6173
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
6174
+ }
6175
+ } break;
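LLM_ARCH_GPTNEOX above uses the classic GPT-NeoX layout: a fused, biased QKV projection, biased LayerNorms on attention, FFN and output, and an untied output head (LLM_TENSOR_OUTPUT is required, with no embedding fallback). The matching graph builder sits outside the hunks shown here; as a hedged sketch, a fused projection of this shape is usually split the way the other fused-QKV architectures in this file do it:

    // split the fused QKV result [n_embd + 2*n_embd_gqa, n_tokens] into Q, K, V slices
    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);

    struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0));
    struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)* n_embd));
    struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)));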
6176
+ case LLM_ARCH_ARCTIC:
6177
+ {
6178
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6179
+
6180
+ // output
6181
+ {
6182
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6183
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6184
+ // if output is NULL, init from the input tok embed
6185
+ if (model.output == NULL) {
6186
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6187
+ }
6188
+ }
6189
+
6190
+ for (int i = 0; i < n_layer; ++i) {
6191
+ ggml_context * ctx_layer = ctx_for_layer(i);
6192
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6193
+
6194
+ auto & layer = model.layers[i];
6195
+
6196
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6197
+
6198
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
6199
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
6200
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
6201
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6202
+
6203
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6204
+
6205
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
6206
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
6207
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
6208
+
6209
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6210
+ layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
6211
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
6212
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
6213
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
6214
+ }
6215
+ } break;
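LLM_ARCH_ARCTIC combines a small dense FFN per layer (note the {n_embd, n_embd} shapes) with a routed mixture of experts: ffn_gate_inp is the router projection and the *_exps tensors stack all n_expert expert weights along a third dimension. One call above, ffn_gate_exps, still passes a bare `false` where the rest of the file now uses the named flags; with a flag-typed parameter that most likely converts to 0 (no flags, i.e. required) rather than "optional", so it reads like a leftover of the old boolean API. A loose sketch of the routing these tensors support (top-k selection and weight blending are assumptions, not shown in this diff):

    // router logits over experts for each token; ffn_gate_inp is [n_embd, n_expert]
    struct ggml_tensor * router = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
    struct ggml_tensor * probs  = ggml_soft_max(ctx0, router);
    // per token, the chosen experts' slices of ffn_up_exps / ffn_gate_exps / ffn_down_exps
    // ([n_embd, n_ff, n_expert]) are applied and the results blended by their routing weights.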
5952
6216
  default:
5953
6217
  throw std::runtime_error("unknown architecture");
5954
6218
  }
@@ -6213,10 +6477,7 @@ static struct ggml_tensor * llm_build_inp_embd(
6213
6477
 
6214
6478
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
6215
6479
  } else {
6216
- #ifdef GGML_USE_MPI
6217
- GGML_ASSERT(false && "not implemented");
6218
- #endif
6219
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6480
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
6220
6481
  inpL = lctx.inp_embd;
6221
6482
  ggml_set_input(lctx.inp_embd);
6222
6483
  }
@@ -6318,7 +6579,7 @@ static struct ggml_tensor * llm_build_ffn(
6318
6579
  llm_ffn_gate_type type_gate,
6319
6580
  const llm_build_cb & cb,
6320
6581
  int il) {
6321
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
6582
+ struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
6322
6583
  cb(tmp, "ffn_up", il);
6323
6584
 
6324
6585
  if (up_b) {
@@ -6500,7 +6761,6 @@ static struct ggml_tensor * llm_build_kqv(
6500
6761
  struct ggml_tensor * wo_b,
6501
6762
  struct ggml_tensor * q_cur,
6502
6763
  struct ggml_tensor * kq_mask,
6503
- struct ggml_tensor * kq_pos,
6504
6764
  int32_t n_tokens,
6505
6765
  int32_t n_kv,
6506
6766
  float kq_scale,
@@ -6512,6 +6772,7 @@ static struct ggml_tensor * llm_build_kqv(
6512
6772
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
6513
6773
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6514
6774
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
6775
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6515
6776
 
6516
6777
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
6517
6778
  cb(q, "q", il);
@@ -6530,31 +6791,27 @@ static struct ggml_tensor * llm_build_kqv(
6530
6791
  GGML_UNUSED(model);
6531
6792
  GGML_UNUSED(n_ctx);
6532
6793
 
6533
- // note: if this assert triggers, then some check has failed earlier
6534
- // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6535
- GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6536
-
6537
6794
  // split cached v into n_head heads (not transposed)
6538
6795
  struct ggml_tensor * v =
6539
6796
  ggml_view_3d(ctx, kv.v_l[il],
6540
6797
  n_embd_head_v, n_kv, n_head_kv,
6541
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6542
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6798
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
6799
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
6543
6800
  0);
6544
6801
  cb(v, "v", il);
6545
6802
 
6546
- cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6803
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6547
6804
 
6548
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6805
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6549
6806
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6550
6807
  }
6551
6808
 
6552
- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6809
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
6553
6810
  } else {
6554
6811
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6555
6812
  cb(kq, "kq", il);
6556
6813
 
6557
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6814
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
6558
6815
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6559
6816
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6560
6817
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6574,28 +6831,8 @@ static struct ggml_tensor * llm_build_kqv(
6574
6831
  kq = ggml_scale(ctx, kq, 30);
6575
6832
  }
6576
6833
 
6577
- #if defined(GGML_USE_KOMPUTE)
6578
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6579
- #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6580
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6581
- if (hparams.use_alibi) {
6582
- kq = ggml_scale(ctx, kq, kq_scale);
6583
- cb(kq, "kq_scaled", il);
6584
-
6585
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6586
- cb(kq, "kq_scaled_alibi", il);
6587
-
6588
- kq = ggml_add(ctx, kq, kq_mask);
6589
- cb(kq, "kq_masked", il);
6590
-
6591
- kq = ggml_soft_max(ctx, kq);
6592
- cb(kq, "kq_soft_max", il);
6593
- } else
6594
- #endif
6595
- {
6596
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6597
- cb(kq, "kq_soft_max_ext", il);
6598
- }
6834
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6835
+ cb(kq, "kq_soft_max_ext", il);
6599
6836
 
6600
6837
  GGML_ASSERT(kv.size == n_ctx);
6601
6838
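With kq_pos gone, ALiBi is no longer fed in as a separate bias tensor: ggml_soft_max_ext() takes the mask, the scale and hparams.f_max_alibi_bias directly, and the Kompute-only fallback through the deprecated ggml_alibi() is removed. For orientation, a hedged sketch of the standard ALiBi slope schedule that such a max_bias parameterises (this follows the ALiBi paper; ggml's internal schedule may differ in detail for non-power-of-two head counts):

    #include <cmath>

    // slope of attention head h (0-based) out of n_head; with max_bias = 8.0f this is
    // the geometric sequence 2^(-8/n_head), 2^(-16/n_head), ... from the ALiBi paper.
    static float alibi_slope(int h, int n_head, float max_bias) {
        const float m0 = std::pow(2.0f, -max_bias / n_head);
        return std::pow(m0, float(h + 1));
    }
    // the bias added to the score of query position i attending to key position j is
    // alibi_slope(h, n_head, max_bias) * (j - i), non-positive under a causal mask.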
 
@@ -6614,7 +6851,7 @@ static struct ggml_tensor * llm_build_kqv(
6614
6851
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6615
6852
  cb(kqv_merged, "kqv_merged", il);
6616
6853
 
6617
- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6854
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
6618
6855
  cb(cur, "kqv_merged_cont", il);
6619
6856
  }
6620
6857
 
@@ -6645,7 +6882,6 @@ static struct ggml_tensor * llm_build_kv(
6645
6882
  struct ggml_tensor * v_cur,
6646
6883
  struct ggml_tensor * q_cur,
6647
6884
  struct ggml_tensor * kq_mask,
6648
- struct ggml_tensor * kq_pos,
6649
6885
  int32_t n_tokens,
6650
6886
  int32_t kv_head,
6651
6887
  int32_t n_kv,
@@ -6664,7 +6900,7 @@ static struct ggml_tensor * llm_build_kv(
6664
6900
  struct ggml_tensor * cur;
6665
6901
 
6666
6902
  cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6667
- q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6903
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
6668
6904
  cb(cur, "kqv_out", il);
6669
6905
 
6670
6906
  return cur;
@@ -6771,18 +7007,17 @@ struct llm_build_context {
6771
7007
 
6772
7008
  ctx0 = ggml_init(params);
6773
7009
 
6774
- lctx.inp_tokens = nullptr;
6775
- lctx.inp_embd = nullptr;
6776
- lctx.inp_pos = nullptr;
7010
+ lctx.inp_tokens = nullptr;
7011
+ lctx.inp_embd = nullptr;
7012
+ lctx.inp_pos = nullptr;
6777
7013
  lctx.inp_out_ids = nullptr;
6778
7014
  lctx.inp_KQ_mask = nullptr;
6779
- lctx.inp_KQ_pos = nullptr;
6780
7015
  lctx.inp_K_shift = nullptr;
6781
- lctx.inp_mean = nullptr;
6782
- lctx.inp_cls = nullptr;
6783
- lctx.inp_s_copy = nullptr;
6784
- lctx.inp_s_mask = nullptr;
6785
- lctx.inp_s_seq = nullptr;
7016
+ lctx.inp_mean = nullptr;
7017
+ lctx.inp_cls = nullptr;
7018
+ lctx.inp_s_copy = nullptr;
7019
+ lctx.inp_s_mask = nullptr;
7020
+ lctx.inp_s_seq = nullptr;
6786
7021
  }
6787
7022
 
6788
7023
  void free() {
@@ -6801,17 +7036,20 @@ struct llm_build_context {
6801
7036
  cb(lctx.inp_K_shift, "K_shift", -1);
6802
7037
  ggml_set_input(lctx.inp_K_shift);
6803
7038
 
7039
+
6804
7040
  for (int il = 0; il < n_layer; ++il) {
7041
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
6805
7042
  struct ggml_tensor * tmp =
6806
7043
  // we rotate only the first n_rot dimensions
6807
- ggml_rope_custom_inplace(ctx0,
7044
+ ggml_rope_ext_inplace(ctx0,
6808
7045
  ggml_view_3d(ctx0, kv_self.k_l[il],
6809
7046
  n_embd_head_k, n_head_kv, n_ctx,
6810
7047
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
6811
7048
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6812
7049
  0),
6813
- lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7050
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6814
7051
  ext_factor, attn_factor, beta_fast, beta_slow);
7052
+
6815
7053
  cb(tmp, "K_shifted", il);
6816
7054
  ggml_build_forward_expand(gf, tmp);
6817
7055
  }
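All RoPE call sites migrate from ggml_rope_custom* to ggml_rope_ext*, which adds a frequency-factors tensor right after the positions: passing nullptr keeps the previous behaviour, while the K-shift loop above threads the per-layer factors from build_rope_factors() through the new argument. The recurring pattern in this diff:

    Qcur = ggml_rope_ext(
        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
        nullptr,   // frequency factors; e.g. the rope_factors tensor used in the K-shift above
        n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);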
@@ -6914,6 +7152,17 @@ struct llm_build_context {
6914
7152
  return lctx.inp_pos;
6915
7153
  }
6916
7154
 
7155
+ struct ggml_tensor * build_rope_factors(int il) {
7156
+ // choose long/short freq factors based on the context size
7157
+ const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7158
+
7159
+ if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7160
+ return model.layers[il].rope_long;
7161
+ }
7162
+
7163
+ return model.layers[il].rope_short;
7164
+ }
7165
+
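build_rope_factors() pairs with the rope_long / rope_short tensors loaded earlier for the Phi-3 layers: both are optional, only layer 0 materialises the data (later layers receive TENSOR_DUPLICATED aliases), and the per-sequence context length decides which set is applied. Condensed from the hunks above:

    // loader, per layer i: only layer 0 owns the tensor, the rest alias it
    layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 },
        llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));

    // graph build: switch to the long factors once the per-sequence context exceeds the
    // original (pre-extension) training context
    const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
    struct ggml_tensor * rope_factors = (n_ctx_pre_seq > hparams.n_yarn_orig_ctx)
        ? model.layers[il].rope_long
        : model.layers[il].rope_short;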
6917
7166
  struct ggml_tensor * build_inp_out_ids() {
6918
7167
  lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
6919
7168
  cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -6932,19 +7181,6 @@ struct llm_build_context {
6932
7181
  return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6933
7182
  }
6934
7183
 
6935
- struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6936
- if (causal) {
6937
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6938
- } else {
6939
- // TODO: this will be needed for ALiBi-based BERT models
6940
- // https://github.com/ggerganov/llama.cpp/pull/6826
6941
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6942
- }
6943
- cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6944
- ggml_set_input(lctx.inp_KQ_pos);
6945
- return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6946
- }
6947
-
6948
7184
  struct ggml_tensor * build_inp_mean() {
6949
7185
  lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6950
7186
  cb(lctx.inp_mean, "inp_mean", -1);
@@ -7034,15 +7270,15 @@ struct llm_build_context {
7034
7270
  cb(Vcur, "Vcur", il);
7035
7271
  }
7036
7272
 
7037
- Qcur = ggml_rope_custom(
7038
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7273
+ Qcur = ggml_rope_ext(
7274
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7039
7275
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7040
7276
  ext_factor, attn_factor, beta_fast, beta_slow
7041
7277
  );
7042
7278
  cb(Qcur, "Qcur", il);
7043
7279
 
7044
- Kcur = ggml_rope_custom(
7045
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7280
+ Kcur = ggml_rope_ext(
7281
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7046
7282
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7047
7283
  ext_factor, attn_factor, beta_fast, beta_slow
7048
7284
  );
@@ -7050,7 +7286,7 @@ struct llm_build_context {
7050
7286
 
7051
7287
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7052
7288
  model.layers[il].wo, model.layers[il].bo,
7053
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7289
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7054
7290
  }
7055
7291
 
7056
7292
  if (il == n_layer - 1) {
@@ -7143,9 +7379,6 @@ struct llm_build_context {
7143
7379
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7144
7380
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7145
7381
 
7146
- // positions of the tokens in the KV cache
7147
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7148
-
7149
7382
  for (int il = 0; il < n_layer; ++il) {
7150
7383
  struct ggml_tensor * inpSA = inpL;
7151
7384
 
@@ -7167,13 +7400,13 @@ struct llm_build_context {
7167
7400
 
7168
7401
  switch (model.type) {
7169
7402
  case MODEL_7B:
7170
- Qcur = ggml_rope_custom(
7171
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7403
+ Qcur = ggml_rope_ext(
7404
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7172
7405
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7173
7406
  ext_factor, attn_factor, beta_fast, beta_slow
7174
7407
  );
7175
- Kcur = ggml_rope_custom(
7176
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7408
+ Kcur = ggml_rope_ext(
7409
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7177
7410
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7178
7411
  ext_factor, attn_factor, beta_fast, beta_slow
7179
7412
  );
@@ -7190,7 +7423,7 @@ struct llm_build_context {
7190
7423
 
7191
7424
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7192
7425
  model.layers[il].wo, NULL,
7193
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7426
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7194
7427
  }
7195
7428
 
7196
7429
  if (il == n_layer - 1) {
@@ -7260,9 +7493,6 @@ struct llm_build_context {
7260
7493
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7261
7494
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7262
7495
 
7263
- // positions of the tokens in the KV cache
7264
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7265
-
7266
7496
  for (int il = 0; il < n_layer; ++il) {
7267
7497
  struct ggml_tensor * inpSA = inpL;
7268
7498
 
@@ -7282,22 +7512,22 @@ struct llm_build_context {
7282
7512
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7283
7513
  cb(Vcur, "Vcur", il);
7284
7514
 
7285
- Qcur = ggml_rope_custom(
7286
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7515
+ Qcur = ggml_rope_ext(
7516
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7287
7517
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7288
7518
  ext_factor, attn_factor, beta_fast, beta_slow
7289
7519
  );
7290
7520
  cb(Qcur, "Qcur", il);
7291
7521
 
7292
- Kcur = ggml_rope_custom(
7293
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7522
+ Kcur = ggml_rope_ext(
7523
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7294
7524
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7295
7525
  ext_factor, attn_factor, beta_fast, beta_slow
7296
7526
  );
7297
7527
  cb(Kcur, "Kcur", il);
7298
7528
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7299
7529
  model.layers[il].wo, NULL,
7300
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7530
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7301
7531
  }
7302
7532
 
7303
7533
  if (il == n_layer - 1) {
@@ -7403,21 +7633,21 @@ struct llm_build_context {
7403
7633
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7404
7634
 
7405
7635
  // using mode = 2 for neox mode
7406
- Qcur = ggml_rope_custom(
7407
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7636
+ Qcur = ggml_rope_ext(
7637
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7408
7638
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7409
7639
  );
7410
7640
  cb(Qcur, "Qcur", il);
7411
7641
 
7412
- Kcur = ggml_rope_custom(
7413
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7642
+ Kcur = ggml_rope_ext(
7643
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7414
7644
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7415
7645
  );
7416
7646
  cb(Kcur, "Kcur", il);
7417
7647
 
7418
7648
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7419
7649
  model.layers[il].wo, NULL,
7420
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7650
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7421
7651
  }
7422
7652
 
7423
7653
  if (il == n_layer - 1) {
@@ -7526,15 +7756,15 @@ struct llm_build_context {
7526
7756
  cb(Vcur, "Vcur", il);
7527
7757
  }
7528
7758
 
7529
- Qcur = ggml_rope_custom(
7530
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7759
+ Qcur = ggml_rope_ext(
7760
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7531
7761
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7532
7762
  ext_factor, attn_factor, beta_fast, beta_slow
7533
7763
  );
7534
7764
  cb(Qcur, "Qcur", il);
7535
7765
 
7536
- Kcur = ggml_rope_custom(
7537
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7766
+ Kcur = ggml_rope_ext(
7767
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7538
7768
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7539
7769
  ext_factor, attn_factor, beta_fast, beta_slow
7540
7770
  );
@@ -7542,7 +7772,7 @@ struct llm_build_context {
7542
7772
 
7543
7773
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7544
7774
  model.layers[il].wo, model.layers[il].bo,
7545
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7775
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7546
7776
  }
7547
7777
 
7548
7778
  if (il == n_layer - 1) {
@@ -7678,15 +7908,15 @@ struct llm_build_context {
7678
7908
  cb(Kcur, "Kcur", il);
7679
7909
  cb(Vcur, "Vcur", il);
7680
7910
 
7681
- Qcur = ggml_rope_custom(
7682
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7911
+ Qcur = ggml_rope_ext(
7912
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7683
7913
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7684
7914
  ext_factor, attn_factor, beta_fast, beta_slow
7685
7915
  );
7686
7916
  cb(Qcur, "Qcur", il);
7687
7917
 
7688
- Kcur = ggml_rope_custom(
7689
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7918
+ Kcur = ggml_rope_ext(
7919
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7690
7920
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7691
7921
  ext_factor, attn_factor, beta_fast, beta_slow
7692
7922
  );
@@ -7694,7 +7924,7 @@ struct llm_build_context {
7694
7924
 
7695
7925
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7696
7926
  model.layers[il].wo, NULL,
7697
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7927
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7698
7928
  }
7699
7929
 
7700
7930
  if (il == n_layer - 1) {
@@ -7806,7 +8036,7 @@ struct llm_build_context {
7806
8036
 
7807
8037
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7808
8038
  model.layers[il].wo, model.layers[il].bo,
7809
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8039
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7810
8040
  }
7811
8041
 
7812
8042
  if (il == n_layer - 1) {
@@ -7855,259 +8085,49 @@ struct llm_build_context {
7855
8085
  return gf;
7856
8086
  }
7857
8087
 
7858
- struct ggml_cgraph * build_persimmon() {
8088
+ struct ggml_cgraph * build_refact() {
7859
8089
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7860
8090
 
7861
8091
  const int64_t n_embd_head = hparams.n_embd_head_v;
7862
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7863
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
8092
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7864
8093
 
7865
8094
  struct ggml_tensor * cur;
7866
8095
  struct ggml_tensor * inpL;
7867
8096
 
7868
8097
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7869
8098
 
7870
- // inp_pos - contains the positions
7871
- struct ggml_tensor * inp_pos = build_inp_pos();
7872
-
7873
8099
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7874
8100
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7875
8101
 
7876
8102
  for (int il = 0; il < n_layer; ++il) {
7877
- struct ggml_tensor * residual = inpL;
8103
+ struct ggml_tensor * inpSA = inpL;
7878
8104
 
7879
8105
  cur = llm_build_norm(ctx0, inpL, hparams,
7880
- model.layers[il].attn_norm,
7881
- model.layers[il].attn_norm_b,
7882
- LLM_NORM, cb, il);
8106
+ model.layers[il].attn_norm, NULL,
8107
+ LLM_NORM_RMS, cb, il);
7883
8108
  cb(cur, "attn_norm", il);
7884
8109
 
7885
- // self attention
8110
+ // self-attention
7886
8111
  {
7887
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7888
- cb(cur, "wqkv", il);
7889
-
7890
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7891
- cb(cur, "bqkv", il);
7892
-
7893
- // split qkv
7894
- GGML_ASSERT(n_head_kv == n_head);
8112
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8113
+ cb(Qcur, "Qcur", il);
7895
8114
 
7896
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7897
- cb(tmpqkv, "tmpqkv", il);
8115
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8116
+ cb(Kcur, "Kcur", il);
7898
8117
 
7899
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7900
- cb(tmpqkv_perm, "tmpqkv", il);
8118
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8119
+ cb(Vcur, "Vcur", il);
7901
8120
 
7902
- struct ggml_tensor * tmpq = ggml_view_3d(
7903
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7904
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7905
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7906
- 0
7907
- );
7908
- cb(tmpq, "tmpq", il);
8121
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8122
+ cb(Kcur, "Kcur", il);
7909
8123
 
7910
- struct ggml_tensor * tmpk = ggml_view_3d(
7911
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
7912
- ggml_element_size(tmpqkv_perm) * n_embd_head,
7913
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
7914
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
7915
- );
7916
- cb(tmpk, "tmpk", il);
8124
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8125
+ cb(Qcur, "Qcur", il);
7917
8126
 
7918
- // Q/K Layernorm
7919
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
7920
- model.layers[il].attn_q_norm,
7921
- model.layers[il].attn_q_norm_b,
7922
- LLM_NORM, cb, il);
7923
- cb(tmpq, "tmpq", il);
7924
-
7925
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
7926
- model.layers[il].attn_k_norm,
7927
- model.layers[il].attn_k_norm_b,
7928
- LLM_NORM, cb, il);
7929
- cb(tmpk, "tmpk", il);
7930
-
7931
- // RoPE the first n_rot of q/k, pass the other half, and concat.
7932
- struct ggml_tensor * qrot = ggml_view_3d(
7933
- ctx0, tmpq, n_rot, n_head, n_tokens,
7934
- ggml_element_size(tmpq) * n_embd_head,
7935
- ggml_element_size(tmpq) * n_embd_head * n_head,
7936
- 0
7937
- );
7938
- cb(qrot, "qrot", il);
7939
-
7940
- struct ggml_tensor * krot = ggml_view_3d(
7941
- ctx0, tmpk, n_rot, n_head, n_tokens,
7942
- ggml_element_size(tmpk) * n_embd_head,
7943
- ggml_element_size(tmpk) * n_embd_head * n_head,
7944
- 0
7945
- );
7946
- cb(krot, "krot", il);
7947
-
7948
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
7949
- struct ggml_tensor * qpass = ggml_view_3d(
7950
- ctx0, tmpq, n_rot, n_head, n_tokens,
7951
- ggml_element_size(tmpq) * n_embd_head,
7952
- ggml_element_size(tmpq) * n_embd_head * n_head,
7953
- ggml_element_size(tmpq) * n_rot
7954
- );
7955
- cb(qpass, "qpass", il);
7956
-
7957
- struct ggml_tensor * kpass = ggml_view_3d(
7958
- ctx0, tmpk, n_rot, n_head, n_tokens,
7959
- ggml_element_size(tmpk) * n_embd_head,
7960
- ggml_element_size(tmpk) * n_embd_head * n_head,
7961
- ggml_element_size(tmpk) * n_rot
7962
- );
7963
- cb(kpass, "kpass", il);
7964
-
7965
- struct ggml_tensor * qrotated = ggml_rope_custom(
7966
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7967
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7968
- );
7969
- cb(qrotated, "qrotated", il);
7970
-
7971
- struct ggml_tensor * krotated = ggml_rope_custom(
7972
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
7973
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7974
- );
7975
- cb(krotated, "krotated", il);
7976
-
7977
- // ggml currently only supports concatenation on dim=2
7978
- // so we need to permute qrot, qpass, concat, then permute back.
7979
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
7980
- cb(qrotated, "qrotated", il);
7981
-
7982
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
7983
- cb(krotated, "krotated", il);
7984
-
7985
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
7986
- cb(qpass, "qpass", il);
7987
-
7988
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
7989
- cb(kpass, "kpass", il);
7990
-
7991
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
7992
- cb(Qcur, "Qcur", il);
7993
-
7994
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
7995
- cb(Kcur, "Kcur", il);
7996
-
7997
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
7998
- cb(Q, "Q", il);
7999
-
8000
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
8001
- cb(Kcur, "Kcur", il);
8002
-
8003
- struct ggml_tensor * Vcur = ggml_view_3d(
8004
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
8005
- ggml_element_size(tmpqkv_perm) * n_embd_head,
8006
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
8007
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
8008
- );
8009
- cb(Vcur, "Vcur", il);
8010
-
8011
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8012
- model.layers[il].wo, model.layers[il].bo,
8013
- Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8014
- }
8015
-
8016
- if (il == n_layer - 1) {
8017
- // skip computing output for unused tokens
8018
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8019
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8020
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8021
- }
8022
-
8023
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
8024
- cb(ffn_inp, "ffn_inp", il);
8025
-
8026
- // feed-forward network
8027
- {
8028
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
8029
- model.layers[il].ffn_norm,
8030
- model.layers[il].ffn_norm_b,
8031
- LLM_NORM, cb, il);
8032
- cb(cur, "ffn_norm", il);
8033
-
8034
- cur = llm_build_ffn(ctx0, cur,
8035
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8036
- NULL, NULL,
8037
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8038
- NULL,
8039
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
8040
- cb(cur, "ffn_out", il);
8041
- }
8042
-
8043
- cur = ggml_add(ctx0, cur, ffn_inp);
8044
- cb(cur, "l_out", il);
8045
-
8046
- inpL = cur;
8047
- }
8048
-
8049
- cur = inpL;
8050
-
8051
- cur = llm_build_norm(ctx0, cur, hparams,
8052
- model.output_norm,
8053
- model.output_norm_b,
8054
- LLM_NORM, cb, -1);
8055
- cb(cur, "result_norm", -1);
8056
-
8057
- cur = ggml_mul_mat(ctx0, model.output, cur);
8058
- cb(cur, "result_output", -1);
8059
-
8060
- ggml_build_forward_expand(gf, cur);
8061
-
8062
- return gf;
8063
- }
8064
-
8065
- struct ggml_cgraph * build_refact() {
8066
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8067
-
8068
- const int64_t n_embd_head = hparams.n_embd_head_v;
8069
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8070
-
8071
- struct ggml_tensor * cur;
8072
- struct ggml_tensor * inpL;
8073
-
8074
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8075
-
8076
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8077
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8078
-
8079
- // positions of the tokens in the KV cache
8080
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8081
-
8082
- for (int il = 0; il < n_layer; ++il) {
8083
- struct ggml_tensor * inpSA = inpL;
8084
-
8085
- cur = llm_build_norm(ctx0, inpL, hparams,
8086
- model.layers[il].attn_norm, NULL,
8087
- LLM_NORM_RMS, cb, il);
8088
- cb(cur, "attn_norm", il);
8089
-
8090
- // self-attention
8091
- {
8092
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8093
- cb(Qcur, "Qcur", il);
8094
-
8095
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8096
- cb(Kcur, "Kcur", il);
8097
-
8098
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8099
- cb(Vcur, "Vcur", il);
8100
-
8101
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8102
- cb(Kcur, "Kcur", il);
8103
-
8104
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8105
- cb(Qcur, "Qcur", il);
8106
-
8107
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8108
- model.layers[il].wo, NULL,
8109
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8110
- }
8127
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8128
+ model.layers[il].wo, NULL,
8129
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8130
+ }
8111
8131
 
8112
8132
  if (il == n_layer - 1) {
8113
8133
  // skip computing output for unused tokens
@@ -8168,8 +8188,11 @@ struct llm_build_context {
8168
8188
 
8169
8189
  struct ggml_tensor * cur;
8170
8190
  struct ggml_tensor * inpL;
8191
+ struct ggml_tensor * inp_pos = nullptr;
8171
8192
 
8172
- struct ggml_tensor * inp_pos = build_inp_pos();
8193
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8194
+ inp_pos = build_inp_pos();
8195
+ }
8173
8196
  struct ggml_tensor * inp_mean = build_inp_mean();
8174
8197
  struct ggml_tensor * inp_cls = build_inp_cls();
8175
8198
 
@@ -8200,13 +8223,26 @@ struct llm_build_context {
8200
8223
  struct ggml_tensor * Vcur;
8201
8224
 
8202
8225
  // self-attention
8203
- if (model.arch == LLM_ARCH_BERT) {
8226
+ if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
8204
8227
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
8205
8228
  cb(Qcur, "Qcur", il);
8206
8229
 
8230
+ if (model.layers[il].attn_q_norm) {
8231
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8232
+ model.layers[il].attn_q_norm,
8233
+ model.layers[il].attn_q_norm_b,
8234
+ LLM_NORM, cb, il);
8235
+ }
8236
+
8207
8237
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
8208
8238
  cb(Kcur, "Kcur", il);
8209
8239
 
8240
+ if (model.layers[il].attn_k_norm) {
8241
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8242
+ model.layers[il].attn_k_norm,
8243
+ model.layers[il].attn_k_norm_b,
8244
+ LLM_NORM, cb, il);
8245
+ }
8210
8246
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
8211
8247
  cb(Vcur, "Vcur", il);
8212
8248
 
@@ -8225,15 +8261,15 @@ struct llm_build_context {
8225
8261
  cb(Kcur, "Kcur", il);
8226
8262
  cb(Vcur, "Vcur", il);
8227
8263
 
8228
- Qcur = ggml_rope_custom(
8229
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8264
+ Qcur = ggml_rope_ext(
8265
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8230
8266
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8231
8267
  ext_factor, attn_factor, beta_fast, beta_slow
8232
8268
  );
8233
8269
  cb(Qcur, "Qcur", il);
8234
8270
 
8235
- Kcur = ggml_rope_custom(
8236
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8271
+ Kcur = ggml_rope_ext(
8272
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8237
8273
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8238
8274
  ext_factor, attn_factor, beta_fast, beta_slow
8239
8275
  );
@@ -8246,7 +8282,7 @@ struct llm_build_context {
8246
8282
  struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
8247
8283
  cb(kq, "kq", il);
8248
8284
 
8249
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8285
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8250
8286
  cb(kq, "kq_soft_max_ext", il);
8251
8287
 
8252
8288
  struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8297,6 +8333,13 @@ struct llm_build_context {
8297
8333
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8298
8334
  NULL,
8299
8335
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8336
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
8337
+ cur = llm_build_ffn(ctx0, cur,
8338
+ model.layers[il].ffn_up, NULL,
8339
+ model.layers[il].ffn_gate, NULL,
8340
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8341
+ NULL,
8342
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
8300
8343
  } else {
8301
8344
  cur = llm_build_ffn(ctx0, cur,
8302
8345
  model.layers[il].ffn_up, NULL,
@@ -8363,9 +8406,6 @@ struct llm_build_context {
8363
8406
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8364
8407
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8365
8408
 
8366
- // positions of the tokens in the KV cache
8367
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8368
-
8369
8409
  inpL = llm_build_norm(ctx0, inpL, hparams,
8370
8410
  model.tok_norm,
8371
8411
  model.tok_norm_b,
@@ -8399,7 +8439,7 @@ struct llm_build_context {
8399
8439
 
8400
8440
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8401
8441
  model.layers[il].wo, model.layers[il].bo,
8402
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8442
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8403
8443
  }
8404
8444
 
8405
8445
  if (il == n_layer - 1) {
@@ -8464,9 +8504,6 @@ struct llm_build_context {
8464
8504
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8465
8505
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8466
8506
 
8467
- // positions of the tokens in the KV cache
8468
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8469
-
8470
8507
  if (model.pos_embd) {
8471
8508
  // inp_pos - contains the positions
8472
8509
  struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8530,13 +8567,13 @@ struct llm_build_context {
8530
8567
 
8531
8568
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8532
8569
  model.layers[il].wo, model.layers[il].bo,
8533
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8570
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8534
8571
  } else {
8535
8572
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8536
8573
 
8537
8574
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8538
8575
  model.layers[il].wo, model.layers[il].bo,
8539
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8576
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8540
8577
  }
8541
8578
  }
8542
8579
 
@@ -8664,15 +8701,15 @@ struct llm_build_context {
8664
8701
  }
8665
8702
 
8666
8703
 
8667
- Qcur = ggml_rope_custom(
8668
- ctx0, Qcur, inp_pos,
8704
+ Qcur = ggml_rope_ext(
8705
+ ctx0, Qcur, inp_pos, nullptr,
8669
8706
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8670
8707
  ext_factor, attn_factor, beta_fast, beta_slow
8671
8708
  );
8672
8709
  cb(Qcur, "Qcur", il);
8673
8710
 
8674
- Kcur = ggml_rope_custom(
8675
- ctx0, Kcur, inp_pos,
8711
+ Kcur = ggml_rope_ext(
8712
+ ctx0, Kcur, inp_pos, nullptr,
8676
8713
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8677
8714
  ext_factor, attn_factor, beta_fast, beta_slow
8678
8715
  );
@@ -8680,7 +8717,7 @@ struct llm_build_context {
8680
8717
 
8681
8718
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8682
8719
  model.layers[il].wo, NULL,
8683
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8720
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8684
8721
  }
8685
8722
 
8686
8723
  if (il == n_layer - 1) {
@@ -8784,21 +8821,21 @@ struct llm_build_context {
8784
8821
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8785
8822
 
8786
8823
  // using mode = 2 for neox mode
8787
- Qcur = ggml_rope_custom(
8788
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8824
+ Qcur = ggml_rope_ext(
8825
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8789
8826
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8790
8827
  );
8791
8828
  cb(Qcur, "Qcur", il);
8792
8829
 
8793
- Kcur = ggml_rope_custom(
8794
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8830
+ Kcur = ggml_rope_ext(
8831
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
8795
8832
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8796
8833
  );
8797
8834
  cb(Kcur, "Kcur", il);
8798
8835
 
8799
8836
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8800
8837
  model.layers[il].wo, NULL,
8801
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8838
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8802
8839
  }
8803
8840
 
8804
8841
  if (il == n_layer - 1) {
@@ -8895,15 +8932,15 @@ struct llm_build_context {
8895
8932
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8896
8933
  cb(Vcur, "Vcur", il);
8897
8934
 
8898
- Qcur = ggml_rope_custom(
8899
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8935
+ Qcur = ggml_rope_ext(
8936
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8900
8937
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8901
8938
  ext_factor, attn_factor, beta_fast, beta_slow
8902
8939
  );
8903
8940
  cb(Qcur, "Qcur", il);
8904
8941
 
8905
- Kcur = ggml_rope_custom(
8906
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8942
+ Kcur = ggml_rope_ext(
8943
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8907
8944
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8908
8945
  ext_factor, attn_factor, beta_fast, beta_slow
8909
8946
  );
@@ -8911,7 +8948,7 @@ struct llm_build_context {
8911
8948
 
8912
8949
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8913
8950
  model.layers[il].wo, model.layers[il].bo,
8914
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8951
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8915
8952
  }
8916
8953
 
8917
8954
  if (il == n_layer - 1) {
@@ -9009,15 +9046,15 @@ struct llm_build_context {
9009
9046
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9010
9047
  cb(Vcur, "Vcur", il);
9011
9048
 
9012
- Qcur = ggml_rope_custom(
9013
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9049
+ Qcur = ggml_rope_ext(
9050
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9014
9051
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9015
9052
  ext_factor, attn_factor, beta_fast, beta_slow
9016
9053
  );
9017
9054
  cb(Qcur, "Qcur", il);
9018
9055
 
9019
- Kcur = ggml_rope_custom(
9020
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9056
+ Kcur = ggml_rope_ext(
9057
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9021
9058
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9022
9059
  ext_factor, attn_factor, beta_fast, beta_slow
9023
9060
  );
@@ -9025,7 +9062,7 @@ struct llm_build_context {
9025
9062
 
9026
9063
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9027
9064
  model.layers[il].wo, model.layers[il].bo,
9028
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9065
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9029
9066
  }
9030
9067
 
9031
9068
  if (il == n_layer - 1) {
@@ -9161,8 +9198,8 @@ struct llm_build_context {
9161
9198
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9162
9199
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9163
9200
 
9164
- Qcur = ggml_rope_custom(
9165
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9201
+ Qcur = ggml_rope_ext(
9202
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9166
9203
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9167
9204
  );
9168
9205
  cb(Qcur, "Qcur", il);
@@ -9172,15 +9209,15 @@ struct llm_build_context {
9172
9209
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9173
9210
  cb(Qcur, "Qcur", il);
9174
9211
 
9175
- Kcur = ggml_rope_custom(
9176
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9212
+ Kcur = ggml_rope_ext(
9213
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9177
9214
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9178
9215
  );
9179
9216
  cb(Kcur, "Kcur", il);
9180
9217
 
9181
9218
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9182
9219
  model.layers[il].wo, model.layers[il].bo,
9183
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9220
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9184
9221
  }
9185
9222
 
9186
9223
  if (il == n_layer - 1) {
@@ -9249,6 +9286,9 @@ struct llm_build_context {
9249
9286
 
9250
9287
  // self-attention
9251
9288
  {
9289
+ // rope freq factors for 128k context
9290
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
9291
+
9252
9292
  struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9253
9293
  model.layers[il].attn_norm,
9254
9294
  NULL,
@@ -9280,8 +9320,8 @@ struct llm_build_context {
9280
9320
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9281
9321
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9282
9322
 
9283
- Qcur = ggml_rope_custom(
9284
- ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9323
+ Qcur = ggml_rope_ext(
9324
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9285
9325
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9286
9326
  );
9287
9327
  cb(Qcur, "Qcur", il);
@@ -9289,15 +9329,15 @@ struct llm_build_context {
9289
9329
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9290
9330
  cb(Qcur, "Qcur", il);
9291
9331
 
9292
- Kcur = ggml_rope_custom(
9293
- ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9332
+ Kcur = ggml_rope_ext(
9333
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9294
9334
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9295
9335
  );
9296
9336
  cb(Kcur, "Kcur", il);
9297
9337
 
9298
9338
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9299
9339
  model.layers[il].wo, model.layers[il].bo,
9300
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9340
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9301
9341
  }
9302
9342
 
9303
9343
  if (il == n_layer - 1) {
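Note: this builder now calls build_rope_factors(il) once per layer and passes the resulting tensor as the freq_factors argument of ggml_rope_ext instead of nullptr, which is what enables the long-context (128k) rope scaling mentioned in the added comment. Sketch under the same assumptions as the hunks above:

    struct ggml_tensor * rope_factors = build_rope_factors(il);   // per-layer frequency factors
    Qcur = ggml_rope_ext(
        ctx0, Qcur, inp_pos, rope_factors,
        n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);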
@@ -9396,21 +9436,21 @@ struct llm_build_context {
9396
9436
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9397
9437
  cb(Vcur, "Vcur", il);
9398
9438
 
9399
- Qcur = ggml_rope_custom(
9400
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
9439
+ Qcur = ggml_rope_ext(
9440
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9401
9441
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9402
9442
  ext_factor, attn_factor, beta_fast, beta_slow);
9403
9443
  cb(Qcur, "Qcur", il);
9404
9444
 
9405
- Kcur = ggml_rope_custom(
9406
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
9445
+ Kcur = ggml_rope_ext(
9446
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9407
9447
  n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9408
9448
  ext_factor, attn_factor, beta_fast, beta_slow);
9409
9449
  cb(Kcur, "Kcur", il);
9410
9450
 
9411
9451
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9412
9452
  model.layers[il].wo, NULL,
9413
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9453
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9414
9454
  }
9415
9455
  struct ggml_tensor * sa_out = cur;
9416
9456
 
@@ -9513,7 +9553,7 @@ struct llm_build_context {
9513
9553
 
9514
9554
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9515
9555
  model.layers[il].wo, model.layers[il].bo,
9516
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9556
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9517
9557
  }
9518
9558
 
9519
9559
  if (il == n_layer - 1) {
@@ -9604,15 +9644,15 @@ struct llm_build_context {
9604
9644
  cb(tmpk, "tmpk", il);
9605
9645
  cb(Vcur, "Vcur", il);
9606
9646
 
9607
- struct ggml_tensor * Qcur = ggml_rope_custom(
9608
- ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
9647
+ struct ggml_tensor * Qcur = ggml_rope_ext(
9648
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9609
9649
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9610
9650
  ext_factor, attn_factor, beta_fast, beta_slow
9611
9651
  );
9612
9652
  cb(Qcur, "Qcur", il);
9613
9653
 
9614
- struct ggml_tensor * Kcur = ggml_rope_custom(
9615
- ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
9654
+ struct ggml_tensor * Kcur = ggml_rope_ext(
9655
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9616
9656
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9617
9657
  ext_factor, attn_factor, beta_fast, beta_slow
9618
9658
  );
@@ -9620,7 +9660,7 @@ struct llm_build_context {
9620
9660
 
9621
9661
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9622
9662
  model.layers[il].wo, model.layers[il].bo,
9623
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9663
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9624
9664
  }
9625
9665
 
9626
9666
  if (il == n_layer - 1) {
@@ -9720,15 +9760,15 @@ struct llm_build_context {
9720
9760
  // cb(Vcur, "Vcur", il);
9721
9761
  // }
9722
9762
 
9723
- Qcur = ggml_rope_custom(
9724
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9763
+ Qcur = ggml_rope_ext(
9764
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9725
9765
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9726
9766
  ext_factor, attn_factor, beta_fast, beta_slow
9727
9767
  );
9728
9768
  cb(Qcur, "Qcur", il);
9729
9769
 
9730
- Kcur = ggml_rope_custom(
9731
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9770
+ Kcur = ggml_rope_ext(
9771
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9732
9772
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9733
9773
  ext_factor, attn_factor, beta_fast, beta_slow
9734
9774
  );
@@ -9736,7 +9776,7 @@ struct llm_build_context {
9736
9776
 
9737
9777
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9738
9778
  model.layers[il].wo, NULL,
9739
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9779
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9740
9780
  }
9741
9781
 
9742
9782
  if (il == n_layer - 1) {
@@ -9837,15 +9877,15 @@ struct llm_build_context {
9837
9877
  cb(Vcur, "Vcur", il);
9838
9878
  }
9839
9879
 
9840
- Qcur = ggml_rope_custom(
9841
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9880
+ Qcur = ggml_rope_ext(
9881
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9842
9882
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9843
9883
  ext_factor, attn_factor, beta_fast, beta_slow
9844
9884
  );
9845
9885
  cb(Qcur, "Qcur", il);
9846
9886
 
9847
- Kcur = ggml_rope_custom(
9848
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9887
+ Kcur = ggml_rope_ext(
9888
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9849
9889
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9850
9890
  ext_factor, attn_factor, beta_fast, beta_slow
9851
9891
  );
@@ -9853,7 +9893,7 @@ struct llm_build_context {
9853
9893
 
9854
9894
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9855
9895
  model.layers[il].wo, model.layers[il].bo,
9856
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9896
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9857
9897
  }
9858
9898
 
9859
9899
  if (il == n_layer - 1) {
@@ -9967,15 +10007,15 @@ struct llm_build_context {
9967
10007
  cb(Vcur, "Vcur", il);
9968
10008
  }
9969
10009
 
9970
- Qcur = ggml_rope_custom(
9971
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10010
+ Qcur = ggml_rope_ext(
10011
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9972
10012
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9973
10013
  ext_factor, attn_factor, beta_fast, beta_slow
9974
10014
  );
9975
10015
  cb(Qcur, "Qcur", il);
9976
10016
 
9977
- Kcur = ggml_rope_custom(
9978
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10017
+ Kcur = ggml_rope_ext(
10018
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9979
10019
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9980
10020
  ext_factor, attn_factor, beta_fast, beta_slow
9981
10021
  );
@@ -9983,7 +10023,7 @@ struct llm_build_context {
9983
10023
 
9984
10024
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9985
10025
  model.layers[il].wo, model.layers[il].bo,
9986
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10026
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9987
10027
  }
9988
10028
 
9989
10029
  if (il == n_layer - 1) {
@@ -10087,8 +10127,8 @@ struct llm_build_context {
10087
10127
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10088
10128
  cb(Vcur, "Vcur", il);
10089
10129
 
10090
- Qcur = ggml_rope_custom(
10091
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
10130
+ Qcur = ggml_rope_ext(
10131
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10092
10132
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10093
10133
  ext_factor, attn_factor, beta_fast, beta_slow);
10094
10134
  cb(Qcur, "Qcur", il);
@@ -10096,15 +10136,15 @@ struct llm_build_context {
10096
10136
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
10097
10137
  cb(Qcur, "Qcur_scaled", il);
10098
10138
 
10099
- Kcur = ggml_rope_custom(
10100
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
10139
+ Kcur = ggml_rope_ext(
10140
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10101
10141
  n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10102
10142
  ext_factor, attn_factor, beta_fast, beta_slow);
10103
10143
  cb(Kcur, "Kcur", il);
10104
10144
 
10105
10145
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10106
10146
  model.layers[il].wo, NULL,
10107
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10147
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10108
10148
  }
10109
10149
 
10110
10150
  if (il == n_layer - 1) {
@@ -10207,15 +10247,15 @@ struct llm_build_context {
10207
10247
  cb(Vcur, "Vcur", il);
10208
10248
  }
10209
10249
 
10210
- Qcur = ggml_rope_custom(
10211
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10250
+ Qcur = ggml_rope_ext(
10251
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10212
10252
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10213
10253
  ext_factor, attn_factor, beta_fast, beta_slow
10214
10254
  );
10215
10255
  cb(Qcur, "Qcur", il);
10216
10256
 
10217
- Kcur = ggml_rope_custom(
10218
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10257
+ Kcur = ggml_rope_ext(
10258
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10219
10259
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10220
10260
  ext_factor, attn_factor, beta_fast, beta_slow
10221
10261
  );
@@ -10223,7 +10263,7 @@ struct llm_build_context {
10223
10263
 
10224
10264
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10225
10265
  model.layers[il].wo, model.layers[il].bo,
10226
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10266
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10227
10267
  }
10228
10268
 
10229
10269
  if (il == n_layer - 1) {
@@ -10490,22 +10530,267 @@ struct llm_build_context {
10490
10530
  LLM_NORM, cb, il);
10491
10531
  cb(Qcur, "Qcur", il);
10492
10532
 
10493
- Kcur = llm_build_norm(ctx0, Kcur, hparams,
10494
- model.layers[il].attn_k_norm,
10495
- NULL,
10496
- LLM_NORM, cb, il);
10497
- cb(Kcur, "Kcur", il);
10498
- }
10533
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
10534
+ model.layers[il].attn_k_norm,
10535
+ NULL,
10536
+ LLM_NORM, cb, il);
10537
+ cb(Kcur, "Kcur", il);
10538
+ }
10539
+
10540
+ Qcur = ggml_rope_ext(
10541
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10542
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10543
+ ext_factor, attn_factor, beta_fast, beta_slow
10544
+ );
10545
+ cb(Qcur, "Qcur", il);
10546
+
10547
+ Kcur = ggml_rope_ext(
10548
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10549
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10550
+ ext_factor, attn_factor, beta_fast, beta_slow
10551
+ );
10552
+ cb(Kcur, "Kcur", il);
10553
+
10554
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10555
+ model.layers[il].wo, model.layers[il].bo,
10556
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10557
+ }
10558
+
10559
+ if (il == n_layer - 1) {
10560
+ // skip computing output for unused tokens
10561
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10562
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10563
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10564
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
10565
+ }
10566
+
10567
+ struct ggml_tensor * attn_out = cur;
10568
+
10569
+ // feed-forward network
10570
+ {
10571
+ cur = llm_build_ffn(ctx0, ffn_inp,
10572
+ model.layers[il].ffn_up, NULL,
10573
+ model.layers[il].ffn_gate, NULL,
10574
+ model.layers[il].ffn_down, NULL,
10575
+ NULL,
10576
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10577
+ cb(cur, "ffn_out", il);
10578
+ }
10579
+
10580
+ // add together residual + FFN + self-attention
10581
+ cur = ggml_add(ctx0, cur, inpL);
10582
+ cur = ggml_add(ctx0, cur, attn_out);
10583
+ cb(cur, "l_out", il);
10584
+
10585
+ // input for next layer
10586
+ inpL = cur;
10587
+ }
10588
+
10589
+ cur = inpL;
10590
+
10591
+ cur = llm_build_norm(ctx0, cur, hparams,
10592
+ model.output_norm, NULL,
10593
+ LLM_NORM, cb, -1);
10594
+ cb(cur, "result_norm", -1);
10595
+
10596
+ // lm_head
10597
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10598
+
10599
+ if (f_logit_scale) {
10600
+ cur = ggml_scale(ctx0, cur, f_logit_scale);
10601
+ }
10602
+
10603
+ cb(cur, "result_output", -1);
10604
+
10605
+ ggml_build_forward_expand(gf, cur);
10606
+
10607
+ return gf;
10608
+
10609
+ }
10610
+
10611
+ // ref: https://allenai.org/olmo
10612
+ // based on the original build_llama() function, changes:
10613
+ // * non-parametric layer norm
10614
+ // * clamp qkv
10615
+ // * removed bias
10616
+ // * removed MoE
10617
+ struct ggml_cgraph * build_olmo() {
10618
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10619
+
10620
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10621
+ int32_t n_tokens = this->n_tokens;
10622
+
10623
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10624
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10625
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10626
+
10627
+ struct ggml_tensor * cur;
10628
+ struct ggml_tensor * inpL;
10629
+
10630
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10631
+
10632
+ // inp_pos - contains the positions
10633
+ struct ggml_tensor * inp_pos = build_inp_pos();
10634
+
10635
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10636
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10637
+
10638
+ for (int il = 0; il < n_layer; ++il) {
10639
+ struct ggml_tensor * inpSA = inpL;
10640
+
10641
+ // norm
10642
+ cur = llm_build_norm(ctx0, inpL, hparams,
10643
+ NULL, NULL,
10644
+ LLM_NORM, cb, il);
10645
+ cb(cur, "attn_norm", il);
10646
+
10647
+ // self-attention
10648
+ {
10649
+ // compute Q and K and RoPE them
10650
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10651
+ cb(Qcur, "Qcur", il);
10652
+ if (hparams.f_clamp_kqv > 0.0f) {
10653
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10654
+ cb(Qcur, "Qcur", il);
10655
+ }
10656
+
10657
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10658
+ cb(Kcur, "Kcur", il);
10659
+ if (hparams.f_clamp_kqv > 0.0f) {
10660
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10661
+ cb(Kcur, "Kcur", il);
10662
+ }
10663
+
10664
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10665
+ cb(Vcur, "Vcur", il);
10666
+ if (hparams.f_clamp_kqv > 0.0f) {
10667
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10668
+ cb(Vcur, "Vcur", il);
10669
+ }
10670
+
10671
+ Qcur = ggml_rope_ext(
10672
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10673
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10674
+ ext_factor, attn_factor, beta_fast, beta_slow
10675
+ );
10676
+ cb(Qcur, "Qcur", il);
10677
+
10678
+ Kcur = ggml_rope_ext(
10679
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10680
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10681
+ ext_factor, attn_factor, beta_fast, beta_slow
10682
+ );
10683
+ cb(Kcur, "Kcur", il);
10684
+
10685
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10686
+ model.layers[il].wo, nullptr,
10687
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10688
+ }
10689
+
10690
+ if (il == n_layer - 1) {
10691
+ // skip computing output for unused tokens
10692
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10693
+ n_tokens = n_outputs;
10694
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10695
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10696
+ }
10697
+
10698
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10699
+ cb(ffn_inp, "ffn_inp", il);
10700
+
10701
+ // feed-forward network
10702
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10703
+ NULL, NULL,
10704
+ LLM_NORM, cb, il);
10705
+ cb(cur, "ffn_norm", il);
10706
+
10707
+ cur = llm_build_ffn(ctx0, cur,
10708
+ model.layers[il].ffn_up, NULL,
10709
+ model.layers[il].ffn_gate, NULL,
10710
+ model.layers[il].ffn_down, NULL,
10711
+ NULL,
10712
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10713
+ cb(cur, "ffn_out", il);
10714
+
10715
+ cur = ggml_add(ctx0, cur, ffn_inp);
10716
+ cb(cur, "ffn_out", il);
10717
+
10718
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10719
+ if (layer_dir != nullptr) {
10720
+ cur = ggml_add(ctx0, cur, layer_dir);
10721
+ }
10722
+ cb(cur, "l_out", il);
10723
+
10724
+ // input for next layer
10725
+ inpL = cur;
10726
+ }
10727
+
10728
+ cur = inpL;
10729
+
10730
+ cur = llm_build_norm(ctx0, cur, hparams,
10731
+ NULL, NULL,
10732
+ LLM_NORM, cb, -1);
10733
+ cb(cur, "result_norm", -1);
10734
+
10735
+ // lm_head
10736
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10737
+ cb(cur, "result_output", -1);
10738
+
10739
+ ggml_build_forward_expand(gf, cur);
10740
+
10741
+ return gf;
10742
+ }
10743
+
10744
+ struct ggml_cgraph * build_gptneox() {
10745
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10746
+
10747
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10748
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
10749
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10750
+
10751
+ struct ggml_tensor * cur;
10752
+ struct ggml_tensor * inpL;
10753
+
10754
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10755
+
10756
+ // inp_pos - contains the positions
10757
+ struct ggml_tensor * inp_pos = build_inp_pos();
10758
+
10759
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10760
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10761
+
10762
+ for (int il = 0; il < n_layer; ++il) {
10763
+ cur = llm_build_norm(ctx0, inpL, hparams,
10764
+ model.layers[il].attn_norm,
10765
+ model.layers[il].attn_norm_b,
10766
+ LLM_NORM, cb, il);
10767
+ cb(cur, "attn_norm", il);
10768
+
10769
+ // self-attention
10770
+ {
10771
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10772
+ cb(cur, "wqkv", il);
10773
+
10774
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10775
+ cb(cur, "bqkv", il);
10776
+
10777
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10778
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
10779
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10780
+
10781
+ cb(Qcur, "Qcur", il);
10782
+ cb(Kcur, "Kcur", il);
10783
+ cb(Vcur, "Vcur", il);
10499
10784
 
10500
- Qcur = ggml_rope_custom(
10501
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10785
+ Qcur = ggml_rope_ext(
10786
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10502
10787
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10503
10788
  ext_factor, attn_factor, beta_fast, beta_slow
10504
10789
  );
10505
10790
  cb(Qcur, "Qcur", il);
10506
10791
 
10507
- Kcur = ggml_rope_custom(
10508
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10792
+ Kcur = ggml_rope_ext(
10793
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10509
10794
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10510
10795
  ext_factor, attn_factor, beta_fast, beta_slow
10511
10796
  );
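Note: this added block finishes a builder whose lm_head output is scaled by f_logit_scale and then introduces build_olmo, which follows build_llama but uses a non-parametric layer norm and optionally clamps Q/K/V. Sketch of the clamp pattern used above (the bound comes from hparams.f_clamp_kqv):

    if (hparams.f_clamp_kqv > 0.0f) {
        // keep activations inside [-clamp, +clamp] before RoPE / attention
        Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
    }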
@@ -10513,68 +10798,84 @@ struct llm_build_context {
10513
10798
 
10514
10799
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10515
10800
  model.layers[il].wo, model.layers[il].bo,
10516
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10801
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10517
10802
  }
10518
10803
 
10519
10804
  if (il == n_layer - 1) {
10520
10805
  // skip computing output for unused tokens
10521
10806
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10522
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10523
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10524
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
10807
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10808
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10525
10809
  }
10526
10810
 
10527
- struct ggml_tensor * attn_out = cur;
10811
+ // ffn
10812
+ if (hparams.use_par_res) {
10813
+ // attention and ffn are computed in parallel
10814
+ // x = x + attn(ln1(x)) + ffn(ln2(x))
10528
10815
 
10529
- // feed-forward network
10530
- {
10531
- cur = llm_build_ffn(ctx0, ffn_inp,
10532
- model.layers[il].ffn_up, NULL,
10533
- model.layers[il].ffn_gate, NULL,
10534
- model.layers[il].ffn_down, NULL,
10816
+ struct ggml_tensor * attn_out = cur;
10817
+
10818
+ cur = llm_build_norm(ctx0, inpL, hparams,
10819
+ model.layers[il].ffn_norm,
10820
+ model.layers[il].ffn_norm_b,
10821
+ LLM_NORM, cb, il);
10822
+ cb(cur, "ffn_norm", il);
10823
+
10824
+ cur = llm_build_ffn(ctx0, cur,
10825
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10826
+ NULL, NULL,
10827
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10535
10828
  NULL,
10536
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10829
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10537
10830
  cb(cur, "ffn_out", il);
10538
- }
10539
10831
 
10540
- // add together residual + FFN + self-attention
10541
- cur = ggml_add(ctx0, cur, inpL);
10542
- cur = ggml_add(ctx0, cur, attn_out);
10543
- cb(cur, "l_out", il);
10832
+ cur = ggml_add(ctx0, cur, inpL);
10833
+ cb(cur, "ffn_out", il);
10544
10834
 
10545
- // input for next layer
10546
- inpL = cur;
10547
- }
10835
+ inpL = ggml_add(ctx0, cur, attn_out);
10836
+ cb(inpL, "l_out", il);
10837
+ } else {
10838
+ // attention and ffn are computed sequentially
10839
+ // x = x + attn(ln1(x))
10840
+ // x = x + ffn(ln2(x))
10548
10841
 
10549
- cur = inpL;
10842
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
10843
+ cb(ffn_inp, "ffn_inp", il);
10550
10844
 
10551
- cur = llm_build_norm(ctx0, cur, hparams,
10552
- model.output_norm, NULL,
10553
- LLM_NORM, cb, -1);
10554
- cb(cur, "result_norm", -1);
10845
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10846
+ model.layers[il].ffn_norm,
10847
+ model.layers[il].ffn_norm_b,
10848
+ LLM_NORM, cb, il);
10849
+ cb(cur, "ffn_norm", il);
10555
10850
 
10556
- // lm_head
10557
- cur = ggml_mul_mat(ctx0, model.output, cur);
10851
+ cur = llm_build_ffn(ctx0, cur,
10852
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
10853
+ NULL, NULL,
10854
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
10855
+ NULL,
10856
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
10857
+ cb(cur, "ffn_out", il);
10558
10858
 
10559
- if (f_logit_scale) {
10560
- cur = ggml_scale(ctx0, cur, f_logit_scale);
10859
+ inpL = ggml_add(ctx0, cur, ffn_inp);
10860
+ cb(inpL, "l_out", il);
10861
+ }
10561
10862
  }
10562
10863
 
10864
+ cur = llm_build_norm(ctx0, inpL, hparams,
10865
+ model.output_norm,
10866
+ model.output_norm_b,
10867
+ LLM_NORM, cb, -1);
10868
+ cb(cur, "result_norm", -1);
10869
+
10870
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10563
10871
  cb(cur, "result_output", -1);
10564
10872
 
10565
10873
  ggml_build_forward_expand(gf, cur);
10566
10874
 
10567
10875
  return gf;
10568
-
10569
10876
  }
10570
10877
 
10571
- // ref: https://allenai.org/olmo
10572
- // based on the original build_llama() function, changes:
10573
- // * non-parametric layer norm
10574
- // * clamp qkv
10575
- // * removed bias
10576
- // * removed MoE
10577
- struct ggml_cgraph * build_olmo() {
10878
+ struct ggml_cgraph * build_arctic() {
10578
10879
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10579
10880
 
10580
10881
  // mutable variable, needed during the last layer of the computation to skip unused tokens
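Note: build_gptneox now branches on hparams.use_par_res, as the added comments describe. A compressed view of the two data flows implemented above:

    // parallel residual:   x_out = x + Attn(LN1(x)) + FFN(LN2(x))      (both branches read x)
    // sequential residual: h     = x + Attn(LN1(x));  x_out = h + FFN(LN2(h))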
@@ -10600,8 +10901,8 @@ struct llm_build_context {
10600
10901
 
10601
10902
  // norm
10602
10903
  cur = llm_build_norm(ctx0, inpL, hparams,
10603
- NULL, NULL,
10604
- LLM_NORM, cb, il);
10904
+ model.layers[il].attn_norm, NULL,
10905
+ LLM_NORM_RMS, cb, il);
10605
10906
  cb(cur, "attn_norm", il);
10606
10907
 
10607
10908
  // self-attention
@@ -10609,42 +10910,30 @@ struct llm_build_context {
10609
10910
  // compute Q and K and RoPE them
10610
10911
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10611
10912
  cb(Qcur, "Qcur", il);
10612
- if (hparams.f_clamp_kqv > 0.0f) {
10613
- Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10614
- cb(Qcur, "Qcur", il);
10615
- }
10616
10913
 
10617
10914
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10618
10915
  cb(Kcur, "Kcur", il);
10619
- if (hparams.f_clamp_kqv > 0.0f) {
10620
- Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10621
- cb(Kcur, "Kcur", il);
10622
- }
10623
10916
 
10624
10917
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10625
10918
  cb(Vcur, "Vcur", il);
10626
- if (hparams.f_clamp_kqv > 0.0f) {
10627
- Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10628
- cb(Vcur, "Vcur", il);
10629
- }
10630
10919
 
10631
- Qcur = ggml_rope_custom(
10632
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10920
+ Qcur = ggml_rope_ext(
10921
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10633
10922
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10634
10923
  ext_factor, attn_factor, beta_fast, beta_slow
10635
10924
  );
10636
10925
  cb(Qcur, "Qcur", il);
10637
10926
 
10638
- Kcur = ggml_rope_custom(
10639
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10927
+ Kcur = ggml_rope_ext(
10928
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10640
10929
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10641
10930
  ext_factor, attn_factor, beta_fast, beta_slow
10642
10931
  );
10643
10932
  cb(Kcur, "Kcur", il);
10644
10933
 
10645
10934
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10646
- model.layers[il].wo, nullptr,
10647
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10935
+ model.layers[il].wo, NULL,
10936
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10648
10937
  }
10649
10938
 
10650
10939
  if (il == n_layer - 1) {
@@ -10660,8 +10949,8 @@ struct llm_build_context {
10660
10949
 
10661
10950
  // feed-forward network
10662
10951
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
10663
- NULL, NULL,
10664
- LLM_NORM, cb, il);
10952
+ model.layers[il].ffn_norm, NULL,
10953
+ LLM_NORM_RMS, cb, il);
10665
10954
  cb(cur, "ffn_norm", il);
10666
10955
 
10667
10956
  cur = llm_build_ffn(ctx0, cur,
@@ -10672,7 +10961,26 @@ struct llm_build_context {
10672
10961
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10673
10962
  cb(cur, "ffn_out", il);
10674
10963
 
10675
- cur = ggml_add(ctx0, cur, ffn_inp);
10964
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
10965
+ cb(ffn_out, "ffn_out", il);
10966
+
10967
+ // MoE
10968
+ cur = llm_build_norm(ctx0, inpSA, hparams,
10969
+ model.layers[il].ffn_norm_exps, NULL,
10970
+ LLM_NORM_RMS, cb, il);
10971
+ cb(cur, "ffn_norm_exps", il);
10972
+
10973
+ cur = llm_build_moe_ffn(ctx0, cur,
10974
+ model.layers[il].ffn_gate_inp,
10975
+ model.layers[il].ffn_up_exps,
10976
+ model.layers[il].ffn_gate_exps,
10977
+ model.layers[il].ffn_down_exps,
10978
+ n_expert, n_expert_used,
10979
+ LLM_FFN_SILU, true,
10980
+ cb, il);
10981
+ cb(cur, "ffn_moe_out", il);
10982
+
10983
+ cur = ggml_add(ctx0, cur, ffn_out);
10676
10984
  cb(cur, "ffn_out", il);
10677
10985
 
10678
10986
  ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
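Note: build_arctic keeps the dense FFN residual and adds a separately RMS-normalized MoE branch through llm_build_moe_ffn, then sums the two before the usual layer output. A compressed view of the combination above (names as in the hunk):

    // ffn_out   = ffn_inp + dense_FFN(rms_norm(ffn_inp))
    // moe_out   = MoE_FFN(rms_norm(inpSA))      // SiLU experts, n_expert / n_expert_used, normalized weights
    // layer_out = moe_out + ffn_out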
@@ -10688,8 +10996,8 @@ struct llm_build_context {
10688
10996
  cur = inpL;
10689
10997
 
10690
10998
  cur = llm_build_norm(ctx0, cur, hparams,
10691
- NULL, NULL,
10692
- LLM_NORM, cb, -1);
10999
+ model.output_norm, NULL,
11000
+ LLM_NORM_RMS, cb, -1);
10693
11001
  cb(cur, "result_norm", -1);
10694
11002
 
10695
11003
  // lm_head
@@ -10816,15 +11124,12 @@ static struct ggml_cgraph * llama_build_graph(
10816
11124
  {
10817
11125
  result = llm.build_starcoder();
10818
11126
  } break;
10819
- case LLM_ARCH_PERSIMMON:
10820
- {
10821
- result = llm.build_persimmon();
10822
- } break;
10823
11127
  case LLM_ARCH_REFACT:
10824
11128
  {
10825
11129
  result = llm.build_refact();
10826
11130
  } break;
10827
11131
  case LLM_ARCH_BERT:
11132
+ case LLM_ARCH_JINA_BERT_V2:
10828
11133
  case LLM_ARCH_NOMIC_BERT:
10829
11134
  {
10830
11135
  result = llm.build_bert();
@@ -10913,6 +11218,14 @@ static struct ggml_cgraph * llama_build_graph(
10913
11218
  {
10914
11219
  result = llm.build_olmo();
10915
11220
  } break;
11221
+ case LLM_ARCH_GPTNEOX:
11222
+ {
11223
+ result = llm.build_gptneox();
11224
+ } break;
11225
+ case LLM_ARCH_ARCTIC:
11226
+ {
11227
+ result = llm.build_arctic();
11228
+ } break;
10916
11229
  default:
10917
11230
  GGML_ASSERT(false);
10918
11231
  }
@@ -11032,11 +11345,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11032
11345
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
11033
11346
  f = -INFINITY;
11034
11347
  } else {
11035
- f = 0.0f;
11348
+ if (hparams.use_alibi) {
11349
+ f = -fabs(lctx.kv_self.cells[i].pos - pos);
11350
+ } else {
11351
+ f = 0.0f;
11352
+ }
11036
11353
  }
11037
11354
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
11038
11355
  }
11039
11356
  }
11357
+
11358
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
11359
+ for (int j = 0; j < n_kv; ++j) {
11360
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
11361
+ }
11362
+ }
11040
11363
  }
11041
11364
  } else {
11042
11365
  // when using kv cache, the mask needs to match the kv cache size
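Note: with the dedicated KQ_pos tensor gone, ALiBi models encode the distance penalty directly in the attention mask: visible positions get -|pos_i - pos_j| instead of 0, and ggml_soft_max_ext applies the per-head slope via its max_alibi_bias argument; padded mask rows are set to -INFINITY. A self-contained sketch of filling one mask row (hypothetical helper, not the exact code above):

    // fill one query row of the mask; `visible` stands for the sequence/position check done above
    static void fill_mask_row(float * row, int n_kv, llama_pos q_pos,
                              const llama_pos * k_pos, bool use_alibi) {
        for (int i = 0; i < n_kv; ++i) {
            const bool visible = k_pos[i] <= q_pos;   // simplified visibility test
            row[i] = !visible  ? -INFINITY
                   : use_alibi ? -fabsf(float(k_pos[i] - q_pos))
                               : 0.0f;
        }
    }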
@@ -11055,7 +11378,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11055
11378
  float f = -INFINITY;
11056
11379
  for (int s = 0; s < batch.n_seq_id[i]; ++s) {
11057
11380
  if (batch.seq_id[i][s] == seq_id) {
11058
- f = 0.0f;
11381
+ if (hparams.use_alibi) {
11382
+ f = -fabs(batch.pos[i] - batch.pos[j]);
11383
+ } else {
11384
+ f = 0.0f;
11385
+ }
11059
11386
  break;
11060
11387
  }
11061
11388
  }
@@ -11071,21 +11398,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11071
11398
  }
11072
11399
  }
11073
11400
 
11074
- // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11075
- // this allows to process multiple sequences in parallel with ALiBi-based models
11076
- if (hparams.use_alibi) {
11077
- const int64_t n_kv = kv_self.n;
11078
-
11079
- GGML_ASSERT(lctx.inp_KQ_pos);
11080
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
11081
-
11082
- float * data = (float *) lctx.inp_KQ_pos->data;
11083
-
11084
- for (int i = 0; i < n_kv; ++i) {
11085
- data[i] = float(lctx.kv_self.cells[i].pos);
11086
- }
11087
- }
11088
-
11089
11401
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
11090
11402
  const int64_t n_tokens = batch.n_tokens;
11091
11403
 
@@ -11259,11 +11571,6 @@ static void llama_graph_compute(
11259
11571
  llama_context & lctx,
11260
11572
  ggml_cgraph * gf,
11261
11573
  int n_threads) {
11262
- #ifdef GGML_USE_MPI
11263
- const int64_t n_layer = lctx.model.hparams.n_layer;
11264
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
11265
- #endif
11266
-
11267
11574
  #ifdef GGML_USE_METAL
11268
11575
  if (ggml_backend_is_metal(lctx.backend_metal)) {
11269
11576
  ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11278,10 +11585,6 @@ static void llama_graph_compute(
11278
11585
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
11279
11586
 
11280
11587
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
11281
-
11282
- #ifdef GGML_USE_MPI
11283
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
11284
- #endif
11285
11588
  }
11286
11589
 
11287
11590
  // decode a batch of tokens by evaluating the transformer
@@ -11319,12 +11622,6 @@ static int llama_decode_internal(
11319
11622
  }
11320
11623
  lctx.n_queued_tokens += n_tokens_all;
11321
11624
 
11322
- #ifdef GGML_USE_MPI
11323
- // TODO: needs fix after #3228
11324
- GGML_ASSERT(false && "not implemented");
11325
- //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
11326
- #endif
11327
-
11328
11625
  auto & kv_self = lctx.kv_self;
11329
11626
 
11330
11627
  const int64_t n_embd = hparams.n_embd;
@@ -11455,7 +11752,8 @@ static int llama_decode_internal(
11455
11752
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11456
11753
  // after enough generations, the benefit from this heuristic disappears
11457
11754
  // if we start defragmenting the cache, the benefit from this will be more important
11458
- kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11755
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
11756
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
11459
11757
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11460
11758
  }
11461
11759
  }
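Note: the minimum attended KV-cache size is no longer hard-coded to 256; it is rounded up to a padding value queried from llama_kv_cache_get_padding(cparams), presumably so different backends or attention paths can require different alignments. GGML_PAD rounds up to a multiple of the pad; an equivalent helper, assuming a power-of-two pad:

    static uint32_t round_up(uint32_t n, uint32_t pad) {
        return ((n + pad - 1) / pad) * pad;   // same result as GGML_PAD for power-of-two pads
    }
    // kv_self.n = std::min(kv_self.size, std::max(pad, round_up(cell_max, pad)));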
@@ -12200,13 +12498,14 @@ struct llm_tokenizer_bpe {
12200
12498
 
12201
12499
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12202
12500
  int final_prev_index = -1;
12501
+ bool ignore_merges = false;
12203
12502
 
12204
12503
  std::vector<std::string> word_collection;
12205
12504
  switch (vocab.type) {
12206
12505
  case LLAMA_VOCAB_TYPE_BPE:
12207
12506
  switch (vocab.type_pre) {
12208
12507
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12209
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
12508
+ ignore_merges = true;
12210
12509
  word_collection = unicode_regex_split(text, {
12211
12510
  // original regex from tokenizer.json
12212
12511
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12514,13 @@ struct llm_tokenizer_bpe {
12215
12514
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12216
12515
  });
12217
12516
  break;
12517
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12518
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12519
+ word_collection = unicode_regex_split(text, {
12520
+ // same as llama3
12521
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12522
+ });
12523
+ break;
12218
12524
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12219
12525
  word_collection = unicode_regex_split(text, {
12220
12526
  "[\r\n]",
@@ -12266,6 +12572,7 @@ struct llm_tokenizer_bpe {
12266
12572
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12267
12573
  });
12268
12574
  break;
12575
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
12269
12576
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12270
12577
  word_collection = unicode_regex_split(text, {
12271
12578
  // original regex from tokenizer.json
@@ -12298,6 +12605,11 @@ struct llm_tokenizer_bpe {
12298
12605
  int index = 0;
12299
12606
  size_t offset = 0;
12300
12607
 
12608
+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
12609
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
12610
+ offset = word.size();
12611
+ }
12612
+
12301
12613
  while (offset < word.size()) {
12302
12614
  llm_symbol sym;
12303
12615
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
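Note: ignore_merges is enabled for the LLaMA-3 pre-tokenizer: when the whole pre-split word already exists in the vocabulary it is emitted as a single symbol up front, so BPE merging does not split and re-merge it. The added check, shown in isolation (assumes `word`, `symbols`, `offset`, and `vocab.token_to_id` as in the tokenizer above):

    if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
        symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
        offset = word.size();   // the byte loop below then has nothing left to consume
    }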
@@ -12483,16 +12795,16 @@ struct llm_tokenizer_wpm {
12483
12795
  // to lowercase, pad chinese characters, pad punctuation
12484
12796
  std::string new_str = "";
12485
12797
  for (uint32_t code : cpts_nfd) {
12486
- int type = unicode_cpt_type(code);
12487
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
12798
+ const codepoint_flags flags = unicode_cpt_flags(code);
12799
+ if (flags.is_accent_mark || flags.is_control) {
12488
12800
  continue;
12489
12801
  }
12490
12802
  code = unicode_tolower(code);
12491
- if (type == CODEPOINT_TYPE_SEPARATOR) {
12803
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12492
12804
  code = ' ';
12493
12805
  }
12494
12806
  std::string s = unicode_cpt_to_utf8(code);
12495
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
12807
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12496
12808
  new_str += " ";
12497
12809
  new_str += s;
12498
12810
  new_str += " ";
@@ -12695,9 +13007,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12695
13007
  // tokenizer.encode('', add_special_tokens=True) returns [1]
12696
13008
  // tokenizer.encode('', add_special_tokens=False) returns []
12697
13009
 
13010
+ static const bool rtrim = true; //TODO: as param
13011
+ bool is_prev_special = false;
13012
+ bool special_token_rtrim = false;
13013
+
12698
13014
  if (add_special && vocab.special_add_bos != 0) {
12699
13015
  GGML_ASSERT(vocab.special_bos_id != -1);
12700
13016
  output.push_back(vocab.special_bos_id);
13017
+ is_prev_special = true;
12701
13018
  }
12702
13019
 
12703
13020
  for (const auto & fragment : fragment_buffer) {
@@ -12709,9 +13026,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12709
13026
  // and passing 'add space prefix' as bool argument
12710
13027
  //
12711
13028
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12712
- if (&fragment == &fragment_buffer.front()) {
12713
- if (vocab.add_space_prefix) {
12714
- raw_text = " " + raw_text; // prefix with space if the first token is not special
13029
+
13030
+ if (special_token_rtrim) {
13031
+ size_t num_whitespaces = 0;
13032
+ while (isspace(raw_text[num_whitespaces])) {
13033
+ num_whitespaces++;
13034
+ }
13035
+ if (num_whitespaces == raw_text.size()) {
13036
+ continue; // skip if all whitespaces
13037
+ }
13038
+ raw_text = raw_text.substr(num_whitespaces);
13039
+ }
13040
+
13041
+ if (vocab.add_space_prefix) {
13042
+ if (!output.size() || is_prev_special) { // prefix with space if first token
13043
+ raw_text = " " + raw_text;
12715
13044
  }
12716
13045
  }
12717
13046
 
@@ -12723,9 +13052,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12723
13052
  tokenizer.tokenize(raw_text, output);
12724
13053
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
12725
13054
  output.push_back(fragment.token);
13055
+ is_prev_special = true;
13056
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
13057
+ special_token_rtrim = rtrim
13058
+ && fragment.token != vocab.special_bos_id
13059
+ && fragment.token != vocab.special_unk_id
13060
+ && fragment.token != vocab.special_eos_id;
12726
13061
  }
12727
13062
  }
12728
13063
 
13064
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13065
+ LLAMA_LOG_WARN(
13066
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13067
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13068
+ "Are you sure this is what you want?\n", __FUNCTION__);
13069
+ }
13070
+
12729
13071
  if (add_special && vocab.special_add_eos == 1) {
12730
13072
  GGML_ASSERT(vocab.special_eos_id != -1);
12731
13073
  output.push_back(vocab.special_eos_id);
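Note: for SPM-style vocabs the tokenizer now strips whitespace that immediately follows a special token other than BOS/UNK/EOS (the rtrim behaviour added above), only applies the automatic space prefix at the start of output or right after a special token, and warns when add_special BOS meets a prompt that already begins with BOS. Sketch of the rtrim step, assuming raw_text as above:

    size_t ws = 0;
    while (ws < raw_text.size() && isspace((unsigned char) raw_text[ws])) {
        ws++;                                  // count whitespace after the special token
    }
    if (ws == raw_text.size()) { /* fragment is all whitespace: skip it */ }
    else                       { raw_text = raw_text.substr(ws); }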
@@ -12752,7 +13094,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12752
13094
  }
12753
13095
  }
12754
13096
 
12755
- GGML_ASSERT(vocab.special_add_eos != 1);
13097
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13098
+ LLAMA_LOG_WARN(
13099
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13100
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13101
+ "Are you sure this is what you want?\n", __FUNCTION__);
13102
+ }
13103
+
13104
+ if (add_special && vocab.special_add_eos == 1) {
13105
+ GGML_ASSERT(vocab.special_add_eos != -1);
13106
+ output.push_back(vocab.special_eos_id);
13107
+ }
12756
13108
  } break;
12757
13109
  case LLAMA_VOCAB_TYPE_WPM:
12758
13110
  {
@@ -13106,6 +13458,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
13106
13458
  return rejects;
13107
13459
  }
13108
13460
 
13461
+ static bool llama_grammar_detect_left_recursion(
13462
+ const std::vector<std::vector<llama_grammar_element>> & rules,
13463
+ size_t rule_index,
13464
+ std::vector<bool> * rules_visited,
13465
+ std::vector<bool> * rules_in_progress,
13466
+ std::vector<bool> * rules_may_be_empty) {
13467
+ if ((*rules_in_progress)[rule_index]) {
13468
+ return true;
13469
+ }
13470
+
13471
+ (*rules_in_progress)[rule_index] = true;
13472
+
13473
+ const std::vector<llama_grammar_element> & rule = rules[rule_index];
13474
+
13475
+ // First check if the rule might produce the empty string. This could be done combined with the second
13476
+ // step but it's more readable as two steps.
13477
+ bool at_rule_start = true;
13478
+ for (size_t i = 0; i < rule.size(); i++) {
13479
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
13480
+ if (at_rule_start) {
13481
+ (*rules_may_be_empty)[rule_index] = true;
13482
+ break;
13483
+ }
13484
+ at_rule_start = true;
13485
+ } else {
13486
+ at_rule_start = false;
13487
+ }
13488
+ }
13489
+
13490
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
13491
+ // be empty)
13492
+ bool recurse_into_nonterminal = true;
13493
+ for (size_t i = 0; i < rule.size(); i++) {
13494
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
13495
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
13496
+ return true;
13497
+ }
13498
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
13499
+ recurse_into_nonterminal = false;
13500
+ }
13501
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
13502
+ recurse_into_nonterminal = true;
13503
+ } else {
13504
+ recurse_into_nonterminal = false;
13505
+ }
13506
+ }
13507
+
13508
+ (*rules_in_progress)[rule_index] = false;
13509
+ (*rules_visited)[rule_index] = true;
13510
+ return false;
13511
+ }
13512
+
13109
13513
  //
13110
13514
  // grammar - external
13111
13515
  //
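Note: llama_grammar_init (next hunk) now rejects grammars containing left recursion, using the detector added above: each rule is walked depth-first while tracking which rules are in progress and which may derive the empty string, so indirect left recursion through nullable prefixes is caught as well. Example of a rule that would now throw (GBNF shown as comments; a hypothetical grammar, not from this diff):

    // root ::= expr
    // expr ::= expr "+" term | term      <-- left-recursive: expr derives itself leftmost
    // term ::= [0-9]+
    //
    // an iterative form avoids the error:
    // expr ::= term ("+" term)*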
@@ -13125,6 +13529,19 @@ struct llama_grammar * llama_grammar_init(
13125
13529
  vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
13126
13530
  }
13127
13531
 
13532
+ // Check for left recursion
13533
+ std::vector<bool> rules_visited(n_rules);
13534
+ std::vector<bool> rules_in_progress(n_rules);
13535
+ std::vector<bool> rules_may_be_empty(n_rules);
13536
+ for (size_t i = 0; i < n_rules; i++) {
13537
+ if (rules_visited[i]) {
13538
+ continue;
13539
+ }
13540
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
13541
+ throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
13542
+ }
13543
+ }
13544
+
13128
13545
  // loop over alternates of start rule to build initial stacks
13129
13546
  std::vector<std::vector<const llama_grammar_element *>> stacks;
13130
13547
  pos = vec_rules[start_rule_index].data();
@@ -13147,6 +13564,9 @@ struct llama_grammar * llama_grammar_init(
13147
13564
  }
13148
13565
  } while (true);
13149
13566
 
13567
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
13568
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
13569
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
13150
13570
  return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
13151
13571
  }
13152
13572
 
@@ -13741,9 +14161,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

  // Sample the next word X using top-k sampling
  llama_sample_top_k(nullptr, candidates, int(k), 1);
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  llama_token X = llama_sample_token(ctx, candidates);
  t_start_sample_us = ggml_time_us();

@@ -13757,9 +14175,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
  // Update mu using the learning rate and error
  *mu = *mu - eta * e;

- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  return X;
  }

@@ -14344,8 +14760,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
- (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  if (qs.model.type == MODEL_70B) {
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15246,6 +15660,7 @@ struct llama_model_params llama_model_default_params() {
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
+ /*.rpc_servers =*/ nullptr,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15731,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  }

  size_t llama_max_devices(void) {
- #if defined(GGML_USE_METAL)
+ #if defined(GGML_USE_RPC)
+ return GGML_RPC_MAX_SERVERS;
+ #elif defined(GGML_USE_METAL)
  return 1;
  #elif defined(GGML_USE_CUDA)
  return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15756,7 @@ bool llama_supports_mlock(void) {

  bool llama_supports_gpu_offload(void) {
  #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  return true;
  #else
@@ -15356,10 +15773,6 @@ void llama_backend_init(void) {
  struct ggml_context * ctx = ggml_init(params);
  ggml_free(ctx);
  }
-
- #ifdef GGML_USE_MPI
- ggml_mpi_backend_init();
- #endif
  }

  void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15369,9 +15782,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
  }

  void llama_backend_free(void) {
- #ifdef GGML_USE_MPI
- ggml_mpi_backend_free();
- #endif
  ggml_quantize_free();
  }

@@ -15402,7 +15812,17 @@ struct llama_model * llama_load_model_from_file(
  return true;
  };
  }
-
+ if (params.rpc_servers != nullptr) {
+ // split the servers set them into model->rpc_servers
+ std::string servers(params.rpc_servers);
+ size_t pos = 0;
+ while ((pos = servers.find(",")) != std::string::npos) {
+ std::string server = servers.substr(0, pos);
+ model->rpc_servers.push_back(server);
+ servers.erase(0, pos + 1);
+ }
+ model->rpc_servers.push_back(servers);
+ }
  int status = llama_model_load(path_model, *model, params);
  GGML_ASSERT(status <= 0);
  if (status < 0) {
@@ -15441,6 +15861,11 @@ struct llama_context * llama_new_context_with_model(
  return nullptr;
  }

+ if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+ params.flash_attn = false;
+ }
+
  llama_context * ctx = new llama_context(*model);

  const auto & hparams = model->hparams;
@@ -15464,7 +15889,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

  // this is necessary due to kv_self.n being padded later during inference
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));

  // with causal attention, the batch size is limited by the context size
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15499,6 +15924,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
  }

+ cparams.yarn_attn_factor *= hparams.rope_attn_factor;
  cparams.causal_attn = hparams.causal_attn;

  if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15509,16 +15935,6 @@ struct llama_context * llama_new_context_with_model(
  }
  }

- if (cparams.flash_attn && hparams.use_alibi) {
- LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
- cparams.flash_attn = false;
- }
-
- if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
- LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
- cparams.flash_attn = false;
- }
-
  if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }
@@ -15554,7 +15970,17 @@ struct llama_context * llama_new_context_with_model(

  if (!hparams.vocab_only) {
  // initialize backends
- #ifdef GGML_USE_METAL
+ #if defined(GGML_USE_RPC)
+ for (auto & server : model->rpc_servers) {
+ ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ #elif defined(GGML_USE_METAL)
  if (model->n_gpu_layers > 0) {
  ctx->backend_metal = ggml_backend_metal_init();
  if (ctx->backend_metal == nullptr) {
@@ -15710,7 +16136,11 @@ struct llama_context * llama_new_context_with_model(
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));

  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
- bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+ bool pipeline_parallel =
+ llama_get_device_count(*model) > 1 &&
+ model->n_gpu_layers > (int)model->hparams.n_layer &&
+ model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ params.offload_kqv;
  #ifndef GGML_USE_CUDA
  // pipeline parallelism requires support for async compute and events
  // currently this is only implemented in the CUDA backend
@@ -15753,20 +16183,6 @@ struct llama_context * llama_new_context_with_model(
  }
  }

- #ifdef GGML_USE_MPI
- ctx->ctx_mpi = ggml_mpi_init();
-
- if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
- // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
- // TODO: needs fix after #3228
- GGML_ASSERT(false && "not implemented");
- //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
- //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
- llama_backend_free();
- exit(1);
- }
- #endif
-
  return ctx;
  }

@@ -15803,11 +16219,11 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  // these models do not use RoPE
  case LLM_ARCH_GPT2:
  case LLM_ARCH_GPTJ:
- case LLM_ARCH_GPTNEOX:
  case LLM_ARCH_MPT:
  case LLM_ARCH_REFACT:
  case LLM_ARCH_BLOOM:
  case LLM_ARCH_MAMBA:
+ case LLM_ARCH_JINA_BERT_V2:
  return LLAMA_ROPE_TYPE_NONE;

  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -15822,13 +16238,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_XVERSE:
  case LLM_ARCH_COMMAND_R:
  case LLM_ARCH_OLMO:
+ case LLM_ARCH_ARCTIC:
  return LLAMA_ROPE_TYPE_NORM;

  // the pairs of head values are offset by n_rot/2
  case LLM_ARCH_FALCON:
  case LLM_ARCH_GROK:
  case LLM_ARCH_DBRX:
- case LLM_ARCH_PERSIMMON:
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_STABLELM:
@@ -15839,6 +16255,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_PHI3:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_STARCODER2:
+ case LLM_ARCH_GPTNEOX:
  return LLAMA_ROPE_TYPE_NEOX;

  // all model arches should be listed explicitly here
@@ -15998,6 +16415,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
  }

  // make tensors
+ cvec.tensors.reserve(model.hparams.n_layer);
  cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
  for (size_t il = 1; il < model.hparams.n_layer; il++) {
  struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16006,6 +16424,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
  }

  // allocate tensors / buffers and zero
+ cvec.ctxs.reserve(ctx_map.size());
+ cvec.bufs.reserve(ctx_map.size());
  for (auto it : ctx_map) {
  ggml_backend_buffer_type_t buft = it.first;
  ggml_context * ctx = it.second;
@@ -16829,13 +17249,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
  }
  else {
  if (cell_range_begin != kv_self.size) {
- cell_ranges.push_back({ cell_range_begin, i });
+ cell_ranges.emplace_back(cell_range_begin, i);
  cell_range_begin = kv_self.size;
  }
  }
  }
  if (cell_range_begin != kv_self.size) {
- cell_ranges.push_back({ cell_range_begin, kv_self.size });
+ cell_ranges.emplace_back(cell_range_begin, kv_self.size);
  }

  // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
@@ -17214,6 +17634,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
  ctx->cparams.n_threads_batch = n_threads_batch;
  }

+ uint32_t llama_n_threads(struct llama_context * ctx) {
+ return ctx->cparams.n_threads;
+ }
+
+ uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+ return ctx->cparams.n_threads_batch;
+ }
+
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
  ctx->abort_callback = abort_callback;
  ctx->abort_callback_data = abort_callback_data;
@@ -17437,6 +17865,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
  );
  }

+ bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+ return llama_is_control_token(model->vocab, token);
+ }
+
  llama_token llama_token_bos(const struct llama_model * model) {
  return model->vocab.special_bos_id;
  }
@@ -17648,6 +18080,15 @@ static int32_t llama_chat_apply_template_internal(
  }
  }
  // llama2 templates seem to not care about "add_generation_prompt"
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+ // Phi 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
  } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
  // zephyr template
  for (auto message : chat) {
@@ -17780,15 +18221,6 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
  }
- } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
- // Phi 3
- for (auto message : chat) {
- std::string role(message->role);
- ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
- }
- if (add_ass) {
- ss << "<|assistant|>\n";
- }
  } else {
  // template not supported
  return -1;
@@ -17910,8 +18342,10 @@ const char * llama_print_system_info(void) {
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
  s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+ s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
  s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
  s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
@@ -17970,6 +18404,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback_user_data = user_data;
  #ifdef GGML_USE_METAL
  ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #elif defined(GGML_USE_CUDA)
+ ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
  #endif
  }