llama_cpp 0.15.1 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_RPC
+# include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -22,16 +26,9 @@
 #ifdef GGML_USE_METAL
 # include "ggml-metal.h"
 #endif
-
-
-#
-#ifndef QK_K
-# ifdef GGML_QKK_64
-# define QK_K 64
-# else
-# define QK_K 256
-# endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -106,7 +103,7 @@
 #endif
 
 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 128
 
 //
 // logging
@@ -201,10 +198,10 @@ enum llm_arch {
 LLM_ARCH_GPTNEOX,
 LLM_ARCH_MPT,
 LLM_ARCH_STARCODER,
-LLM_ARCH_PERSIMMON,
 LLM_ARCH_REFACT,
 LLM_ARCH_BERT,
 LLM_ARCH_NOMIC_BERT,
+LLM_ARCH_JINA_BERT_V2,
 LLM_ARCH_BLOOM,
 LLM_ARCH_STABLELM,
 LLM_ARCH_QWEN,
@@ -224,43 +221,45 @@ enum llm_arch {
 LLM_ARCH_COMMAND_R,
 LLM_ARCH_DBRX,
 LLM_ARCH_OLMO,
+LLM_ARCH_ARCTIC,
 LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-{ LLM_ARCH_LLAMA, "llama"
-{ LLM_ARCH_FALCON, "falcon"
-{ LLM_ARCH_GROK, "grok"
-{ LLM_ARCH_GPT2, "gpt2"
-{ LLM_ARCH_GPTJ, "gptj"
-{ LLM_ARCH_GPTNEOX, "gptneox"
-{ LLM_ARCH_MPT, "mpt"
-{ LLM_ARCH_BAICHUAN, "baichuan"
-{ LLM_ARCH_STARCODER, "starcoder"
-{
-{
-{
-{
-{ LLM_ARCH_BLOOM, "bloom"
-{ LLM_ARCH_STABLELM, "stablelm"
-{ LLM_ARCH_QWEN, "qwen"
-{ LLM_ARCH_QWEN2, "qwen2"
-{ LLM_ARCH_QWEN2MOE, "qwen2moe"
-{ LLM_ARCH_PHI2, "phi2"
-{ LLM_ARCH_PHI3, "phi3"
-{ LLM_ARCH_PLAMO, "plamo"
-{ LLM_ARCH_CODESHELL, "codeshell"
-{ LLM_ARCH_ORION, "orion"
-{ LLM_ARCH_INTERNLM2, "internlm2"
-{ LLM_ARCH_MINICPM, "minicpm"
-{ LLM_ARCH_GEMMA, "gemma"
-{ LLM_ARCH_STARCODER2, "starcoder2"
-{ LLM_ARCH_MAMBA, "mamba"
-{ LLM_ARCH_XVERSE, "xverse"
-{ LLM_ARCH_COMMAND_R, "command-r"
-{ LLM_ARCH_DBRX, "dbrx"
-{ LLM_ARCH_OLMO, "olmo"
-{
+{ LLM_ARCH_LLAMA, "llama" },
+{ LLM_ARCH_FALCON, "falcon" },
+{ LLM_ARCH_GROK, "grok" },
+{ LLM_ARCH_GPT2, "gpt2" },
+{ LLM_ARCH_GPTJ, "gptj" },
+{ LLM_ARCH_GPTNEOX, "gptneox" },
+{ LLM_ARCH_MPT, "mpt" },
+{ LLM_ARCH_BAICHUAN, "baichuan" },
+{ LLM_ARCH_STARCODER, "starcoder" },
+{ LLM_ARCH_REFACT, "refact" },
+{ LLM_ARCH_BERT, "bert" },
+{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+{ LLM_ARCH_BLOOM, "bloom" },
+{ LLM_ARCH_STABLELM, "stablelm" },
+{ LLM_ARCH_QWEN, "qwen" },
+{ LLM_ARCH_QWEN2, "qwen2" },
+{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
+{ LLM_ARCH_PHI2, "phi2" },
+{ LLM_ARCH_PHI3, "phi3" },
+{ LLM_ARCH_PLAMO, "plamo" },
+{ LLM_ARCH_CODESHELL, "codeshell" },
+{ LLM_ARCH_ORION, "orion" },
+{ LLM_ARCH_INTERNLM2, "internlm2" },
+{ LLM_ARCH_MINICPM, "minicpm" },
+{ LLM_ARCH_GEMMA, "gemma" },
+{ LLM_ARCH_STARCODER2, "starcoder2" },
+{ LLM_ARCH_MAMBA, "mamba" },
+{ LLM_ARCH_XVERSE, "xverse" },
+{ LLM_ARCH_COMMAND_R, "command-r" },
+{ LLM_ARCH_DBRX, "dbrx" },
+{ LLM_ARCH_OLMO, "olmo" },
+{ LLM_ARCH_ARCTIC, "arctic" },
+{ LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 enum llm_kv {
@@ -303,6 +302,7 @@ enum llm_kv {
 LLM_KV_ROPE_SCALE_LINEAR,
 LLM_KV_ROPE_SCALING_TYPE,
 LLM_KV_ROPE_SCALING_FACTOR,
+LLM_KV_ROPE_SCALING_ATTN_FACTOR,
 LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
 LLM_KV_ROPE_SCALING_FINETUNED,
 
@@ -380,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
 { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
 { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
 { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
 { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
 
@@ -435,6 +436,8 @@ enum llm_tensor {
 LLM_TENSOR_OUTPUT,
 LLM_TENSOR_OUTPUT_NORM,
 LLM_TENSOR_ROPE_FREQS,
+LLM_TENSOR_ROPE_FACTORS_LONG,
+LLM_TENSOR_ROPE_FACTORS_SHORT,
 LLM_TENSOR_ATTN_Q,
 LLM_TENSOR_ATTN_K,
 LLM_TENSOR_ATTN_V,
@@ -454,6 +457,7 @@ enum llm_tensor {
 LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
 LLM_TENSOR_FFN_GATE_EXP,
 LLM_TENSOR_FFN_UP_EXP,
+LLM_TENSOR_FFN_NORM_EXPS,
 LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
 LLM_TENSOR_FFN_GATE_EXPS,
 LLM_TENSOR_FFN_UP_EXPS,
@@ -592,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
-{
-LLM_ARCH_PERSIMMON,
-{
-{ LLM_TENSOR_TOKEN_EMBD, "token_embd"},
-{ LLM_TENSOR_OUTPUT_NORM, "output_norm"},
-{ LLM_TENSOR_OUTPUT, "output"},
-{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
-{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
-{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
-{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
-{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
-{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
-{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
-{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
-{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
-},
-},
 {
 LLM_ARCH_MPT,
 {
@@ -691,6 +678,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_JINA_BERT_V2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
+{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_BLOOM,
 {
@@ -800,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 {
 LLM_ARCH_PHI3,
 {
-{ LLM_TENSOR_TOKEN_EMBD,
-{ LLM_TENSOR_OUTPUT_NORM,
-{ LLM_TENSOR_OUTPUT,
-{
-{
-{
-{
-{
-{
-{
-{
-{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
 {
@@ -1027,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_ARCTIC,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -1664,91 +1694,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
 GGML_UNUSED(host_buffer);
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-buft = ggml_backend_kompute_buffer_type(gpu);
-if (buft == nullptr) {
-LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-}
-#endif
-
-if (buft == nullptr) {
-buft = llama_default_buffer_type_cpu(true);
-}
-return buft;
-
-GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-if (ggml_backend_cuda_get_device_count() > 1) {
-buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-}
-#endif
-
-#ifdef GGML_USE_SYCL
-if (ggml_backend_sycl_get_device_count() > 1) {
-buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-}
-#endif
-
-if (buft == nullptr) {
-buft = llama_default_buffer_type_offload(fallback_gpu);
-}
-return buft;
-
-GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-return ggml_backend_vk_get_device_count();
-#else
-return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-size_t total;
-size_t free;
-ggml_backend_cuda_get_device_memory(device, &free, &total);
-return free;
-#elif defined(GGML_USE_SYCL)
-size_t total;
-size_t free;
-ggml_backend_sycl_get_device_memory(device, &free, &total);
-return free;
-#elif defined(GGML_USE_VULKAN)
-size_t total;
-size_t free;
-ggml_backend_vk_get_device_memory(device, &free, &total);
-return free;
-#else
-return 1;
-GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1757,6 +1702,8 @@ struct llama_state {
 llama_state() {
 #ifdef GGML_USE_METAL
 ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
 }
 
@@ -1770,17 +1717,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
 MODEL_UNKNOWN,
+MODEL_14M,
 MODEL_17M,
 MODEL_22M,
 MODEL_33M,
+MODEL_70M,
 MODEL_109M,
 MODEL_137M,
+MODEL_160M,
 MODEL_335M,
+MODEL_410M,
 MODEL_0_5B,
 MODEL_1B,
+MODEL_1_4B,
 MODEL_2B,
+MODEL_2_8B,
 MODEL_3B,
 MODEL_4B,
+MODEL_6_9B,
 MODEL_7B,
 MODEL_8B,
 MODEL_12B,
@@ -1803,6 +1757,7 @@ enum e_model {
 MODEL_8x7B,
 MODEL_8x22B,
 MODEL_16x12B,
+MODEL_10B_128x3_66B,
 };
 
 static const size_t kiB = 1024;
@@ -1812,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
 bool vocab_only;
 bool rope_finetuned;
+bool use_par_res;
 
 uint32_t n_vocab;
 uint32_t n_ctx_train; // context size the model was trained on
@@ -1830,6 +1786,7 @@ struct llama_hparams {
 float f_norm_eps;
 float f_norm_rms_eps;
 
+float rope_attn_factor = 1.0f;
 float rope_freq_base_train;
 float rope_freq_scale_train;
 uint32_t n_yarn_orig_ctx;
@@ -1845,7 +1802,7 @@ struct llama_hparams {
 float f_logit_scale = 0.0f;
 
 bool causal_attn = true;
-bool use_alibi = false;
+bool use_alibi = false;
 
 enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
 enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1878,6 +1835,7 @@ struct llama_hparams {
 
 if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
 if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
@@ -1975,6 +1933,7 @@ struct llama_layer {
 struct ggml_tensor * ffn_norm_b;
 struct ggml_tensor * layer_out_norm;
 struct ggml_tensor * layer_out_norm_b;
+struct ggml_tensor * ffn_norm_exps;
 
 // ff
 struct ggml_tensor * ffn_gate; // w1
@@ -2012,6 +1971,10 @@ struct llama_layer {
 // mamba bias
 struct ggml_tensor * ssm_conv1d_b;
 struct ggml_tensor * ssm_dt_b;
+
+// long rope factors
+struct ggml_tensor * rope_long = nullptr;
+struct ggml_tensor * rope_short = nullptr;
 };
 
 struct llama_kv_cell {
@@ -2189,6 +2152,8 @@ struct llama_model {
 int main_gpu;
 int n_gpu_layers;
 
+std::vector<std::string> rpc_servers;
+
 // gguf metadata
 std::unordered_map<std::string, std::string> gguf_kv;
 
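The new `rpc_servers` field on `llama_model` holds one endpoint string per remote device. As a rough illustration of how such a list could be populated from a single configuration value (a hedged sketch, not code from this diff; the comma-separated format is an assumption), splitting `"host:port,host:port"` into a vector looks like this:

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Split a comma-separated endpoint list such as "192.168.1.10:50052,192.168.1.11:50052"
// into one entry per remote server, mirroring the shape of the new rpc_servers field.
static std::vector<std::string> split_endpoints(const std::string & csv) {
    std::vector<std::string> out;
    std::stringstream ss(csv);
    std::string item;
    while (std::getline(ss, item, ',')) {
        if (!item.empty()) {
            out.push_back(item);
        }
    }
    return out;
}

int main() {
    for (const auto & ep : split_endpoints("127.0.0.1:50052,10.0.0.2:50052")) {
        std::cout << ep << "\n";
    }
}
```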
@@ -2317,7 +2282,6 @@ struct llama_context {
 struct ggml_tensor * inp_pos; // I32 [n_batch]
 struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
 struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
 struct ggml_tensor * inp_K_shift; // I32 [kv_size]
 struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
 struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2327,11 +2291,105 @@ struct llama_context {
 
 // control vectors
 struct llama_control_vector cvec;
+};
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+std::string endpoint = model.rpc_servers[gpu];
+buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+buft = ggml_backend_kompute_buffer_type(gpu);
+if (buft == nullptr) {
+LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+}
+#endif
+
+if (buft == nullptr) {
+buft = llama_default_buffer_type_cpu(true);
+}
+return buft;
+GGML_UNUSED(model);
+GGML_UNUSED(gpu);
+}
 
--
--
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+if (ggml_backend_cuda_get_device_count() > 1) {
+buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+}
 #endif
-
+
+#ifdef GGML_USE_SYCL
+if (ggml_backend_sycl_get_device_count() > 1) {
+buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+}
+#endif
+
+if (buft == nullptr) {
+buft = llama_default_buffer_type_offload(model, fallback_gpu);
+}
+return buft;
+
+GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+return ggml_backend_vk_get_device_count();
+#else
+return 1;
+#endif
+GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+size_t total;
+size_t free;
+std::string endpoint = model.rpc_servers[device];
+ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+return free;
+#elif defined(GGML_USE_CUDA)
+size_t total;
+size_t free;
+ggml_backend_cuda_get_device_memory(device, &free, &total);
+return free;
+#elif defined(GGML_USE_SYCL)
+size_t total;
+size_t free;
+ggml_backend_sycl_get_device_memory(device, &free, &total);
+return free;
+#elif defined(GGML_USE_VULKAN)
+size_t total;
+size_t free;
+ggml_backend_vk_get_device_memory(device, &free, &total);
+return free;
+#else
+return 1;
+#endif
+GGML_UNUSED(model);
+GGML_UNUSED(device);
+}
 
 //
 // kv cache helpers
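The reworked `llama_get_device_count`/`llama_get_device_memory` now take the model, so that under `GGML_USE_RPC` the "devices" are simply the configured endpoints. A compressed, self-contained sketch of the same compile-time dispatch pattern (stand-in functions only; the real backend calls live in ggml):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for llama_model; only the dispatch pattern is the point here.
struct fake_model { std::vector<std::string> rpc_servers; };

static size_t get_device_count(const fake_model & model) {
#if defined(GGML_USE_RPC)
    // with the RPC backend, each configured remote endpoint counts as a device
    return model.rpc_servers.size();
#elif defined(GGML_USE_CUDA)
    return 4; // would be ggml_backend_cuda_get_device_count() in the real code
#else
    (void) model;
    return 1; // CPU-only build exposes a single device
#endif
}

int main() {
    fake_model m;
    m.rpc_servers = { "10.0.0.2:50052", "10.0.0.3:50052" };
    std::printf("devices: %zu\n", get_device_count(m));
}
```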
@@ -2452,7 +2510,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
 struct llama_kv_cache & cache,
 const struct llama_batch & batch) {
-const uint32_t n_ctx = cache.size;
 const uint32_t n_tokens = batch.n_tokens;
 
 if (cache.recurrent) {
@@ -2503,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
 }
 // otherwise, one cell per token.
 
-if (n_tokens >
-LLAMA_LOG_ERROR("%s: n_tokens=%d >
+if (n_tokens > cache.size) {
+LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
 return false;
 }
 
 uint32_t n_tested = 0;
 
 while (true) {
-if (cache.head + n_tokens >
-n_tested +=
+if (cache.head + n_tokens > cache.size) {
+n_tested += cache.size - cache.head;
 cache.head = 0;
 continue;
 }
@@ -2531,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
 break;
 }
 
-if (n_tested >=
+if (n_tested >= cache.size) {
 //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
 return false;
 }
@@ -2785,6 +2842,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
 cache.do_defrag = true;
 }
 
+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+// the FA kernels require padding to avoid extra runtime boundary checks
+return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
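The new `llama_kv_cache_get_padding` helper returns 256 when flash attention is enabled and 32 otherwise; the cache size is then rounded up to that multiple so the attention kernels never have to bounds-check a partial block. A minimal sketch of that rounding (the round-up formula is an assumption used for illustration, not copied from this diff):

```cpp
#include <cstdint>
#include <cstdio>

// Round n up to the next multiple of pad (pad is a power of two in practice).
static uint32_t pad_to(uint32_t n, uint32_t pad) {
    return ((n + pad - 1) / pad) * pad;
}

int main() {
    const bool flash_attn = true;
    const uint32_t padding = flash_attn ? 256u : 32u; // same choice as the new helper
    std::printf("kv size for n_ctx=1000: %u\n", pad_to(1000, padding)); // -> 1024
}
```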
@@ -3287,22 +3349,55 @@ struct llama_model_loader {
 }
 
 template<typename T>
-bool
-
+bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+const int kid = gguf_find_key(meta, key.c_str());
 
-
-
+if (kid < 0) {
+if (required) {
+throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+}
+return false;
+}
 
-
+struct GGUFMeta::ArrayInfo arr_info =
+GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
 
-if (
-throw std::runtime_error(format("
+if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
 }
 
-
-
+// GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
 
-
+result.resize(arr_info.length);
+result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+return true;
+}
+
+template<typename T>
+bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+return get_arr(llm_kv(kid), result, required);
+}
+
+template<typename T>
+bool get_key(const std::string & key, T & result, const bool required = true) {
+auto it = kv_overrides.find(key);
+
+const struct llama_model_kv_override * override =
+it != kv_overrides.end() ? &it->second : nullptr;
+
+const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+
+if (required && !found) {
+throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+}
+
+return found;
+}
+
+template<typename T>
 bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
 return get_key(llm_kv(kid), result, required);
 }
@@ -3360,11 +3455,15 @@ struct llama_model_loader {
 return get_tensor_meta(get_tensor_name(i));
 }
 
-struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
 struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
 ggml_set_name(tensor, ggml_get_name(cur));
 
-
+if (duplicated) {
+size_data += ggml_nbytes(cur);
+} else {
+n_created++;
+}
 
 return tensor;
 }
@@ -3399,14 +3498,17 @@ struct llama_model_loader {
 return cur;
 }
 
-
-
+static const int TENSOR_NOT_REQUIRED = 1;
+static const int TENSOR_DUPLICATED = 2;
+
+struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
 if (cur == NULL) {
 return NULL;
 }
 
-return create_tensor_for(ctx, cur);
+return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
 }
 
 struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
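The loader changes above replace the old `n_created--` / `size_data +=` bookkeeping with two bit flags, `TENSOR_NOT_REQUIRED` and `TENSOR_DUPLICATED`, passed to `create_tensor`. A small self-contained sketch of the same flag pattern (hypothetical helper, not the gem's API):

```cpp
#include <cstdio>
#include <string>

// Bit flags in the style of llama_model_loader::TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED.
enum tensor_flags : int {
    TENSOR_NOT_REQUIRED = 1, // a missing tensor is tolerated instead of being an error
    TENSOR_DUPLICATED   = 2, // the tensor reuses existing data (e.g. tied output/embedding)
};

static bool load_tensor(const std::string & name, int flags) {
    const bool present = (name != "output.weight"); // pretend this one is absent
    if (!present) {
        if (flags & TENSOR_NOT_REQUIRED) {
            return false; // caller falls back, e.g. to the token embedding
        }
        std::printf("error: required tensor %s missing\n", name.c_str());
        return false;
    }
    if (flags & TENSOR_DUPLICATED) {
        // duplicated tensors count toward data size but not toward the created-tensor count
    }
    return true;
}

int main() {
    if (!load_tensor("output.weight", TENSOR_NOT_REQUIRED)) {
        load_tensor("token_embd.weight", TENSOR_DUPLICATED); // tie output to embeddings
    }
}
```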
@@ -3706,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
 switch (type) {
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-
+case MODEL_14M: return "14M";
+case MODEL_17M: return "17M";
+case MODEL_22M: return "22M";
+case MODEL_33M: return "33M";
+case MODEL_70M: return "70M";
+case MODEL_109M: return "109M";
+case MODEL_137M: return "137M";
+case MODEL_160M: return "160M";
+case MODEL_335M: return "335M";
+case MODEL_410M: return "410M";
+case MODEL_0_5B: return "0.5B";
+case MODEL_1B: return "1B";
+case MODEL_1_4B: return "1.4B";
+case MODEL_2B: return "2B";
+case MODEL_2_8B: return "2.8B";
+case MODEL_3B: return "3B";
+case MODEL_4B: return "4B";
+case MODEL_6_9B: return "6.9B";
+case MODEL_7B: return "7B";
+case MODEL_8B: return "8B";
+case MODEL_12B: return "12B";
+case MODEL_13B: return "13B";
+case MODEL_14B: return "14B";
+case MODEL_15B: return "15B";
+case MODEL_20B: return "20B";
+case MODEL_30B: return "30B";
+case MODEL_34B: return "34B";
+case MODEL_35B: return "35B";
+case MODEL_40B: return "40B";
+case MODEL_65B: return "65B";
+case MODEL_70B: return "70B";
+case MODEL_314B: return "314B";
+case MODEL_SMALL: return "0.1B";
+case MODEL_MEDIUM: return "0.4B";
+case MODEL_LARGE: return "0.8B";
+case MODEL_XL: return "1.5B";
+case MODEL_A2_7B: return "A2.7B";
+case MODEL_8x7B: return "8x7B";
+case MODEL_8x22B: return "8x22B";
+case MODEL_16x12B: return "16x12B";
+case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+default: return "?B";
 }
 }
 
@@ -3779,6 +3892,12 @@ static void llm_load_hparams(
 
 // get hparams kv
 ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+// everything past this point is not vocab-related
+if (hparams.vocab_only) {
+return;
+}
+
 ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
 ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
 ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3823,6 +3942,8 @@ static void llm_load_hparams(
 }
 hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
 // sanity check for n_rot (optional)
 {
 hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3860,7 +3981,7 @@ static void llm_load_hparams(
 switch (hparams.n_layer) {
 case 22: model.type = e_model::MODEL_1B; break;
 case 26: model.type = e_model::MODEL_3B; break;
-case 32: model.type = hparams.
+case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
 case 40: model.type = e_model::MODEL_13B; break;
 case 48: model.type = e_model::MODEL_34B; break;
 case 60: model.type = e_model::MODEL_30B; break;
@@ -3922,14 +4043,6 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
-case LLM_ARCH_PERSIMMON:
-{
-ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-switch (hparams.n_layer) {
-case 36: model.type = e_model::MODEL_8B; break;
-default: model.type = e_model::MODEL_UNKNOWN;
-}
-} break;
 case LLM_ARCH_REFACT:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3962,6 +4075,19 @@ static void llm_load_hparams(
 model.type = e_model::MODEL_335M; break; // bge-large
 }
 } break;
+case LLM_ARCH_JINA_BERT_V2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+hparams.f_max_alibi_bias = 8.0f;
+
+switch (hparams.n_layer) {
+case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+}
+} break;
 case LLM_ARCH_NOMIC_BERT:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4058,6 +4184,7 @@ static void llm_load_hparams(
 switch (hparams.n_layer) {
 case 24: model.type = e_model::MODEL_1B; break;
 case 32: model.type = e_model::MODEL_3B; break;
+case 40: model.type = e_model::MODEL_14B; break;
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
@@ -4198,6 +4325,65 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_GPTNEOX:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+switch (hparams.n_layer) {
+case 6:
+switch (hparams.n_ff) {
+case 512: model.type = e_model::MODEL_14M; break;
+case 2048: model.type = e_model::MODEL_70M; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 12:
+switch (hparams.n_ff) {
+case 3072: model.type = e_model::MODEL_160M; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 16:
+switch (hparams.n_ff) {
+case 8192: model.type = e_model::MODEL_1B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 24:
+switch (hparams.n_ff) {
+case 4096: model.type = e_model::MODEL_410M; break;
+case 8192: model.type = e_model::MODEL_1_4B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 32:
+switch (hparams.n_ff) {
+case 10240: model.type = e_model::MODEL_2_8B; break;
+case 16384: model.type = e_model::MODEL_6_9B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 36:
+switch (hparams.n_ff) {
+case 20480: model.type = e_model::MODEL_12B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 44:
+switch (hparams.n_ff) {
+case 24576: model.type = e_model::MODEL_20B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
+case LLM_ARCH_ARCTIC:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+if (hparams.n_expert == 128) {
+switch (hparams.n_layer) {
+case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} else {
+model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }
 
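The new GPT-NeoX case identifies the Pythia-style variant from the (n_layer, n_ff) pair. The same lookup can be written as a table, which is how one might prototype it outside the switch (a hedged sketch; the values are taken from the hunk above):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <utility>

int main() {
    // (n_layer, n_ff) -> human-readable size, mirroring the new GPT-NeoX switch
    const std::map<std::pair<uint32_t, uint32_t>, std::string> sizes = {
        {{ 6,   512}, "14M"},  {{ 6,  2048}, "70M"},
        {{12,  3072}, "160M"}, {{16,  8192}, "1B"},
        {{24,  4096}, "410M"}, {{24,  8192}, "1.4B"},
        {{32, 10240}, "2.8B"}, {{32, 16384}, "6.9B"},
        {{36, 20480}, "12B"},  {{44, 24576}, "20B"},
    };
    const auto it = sizes.find({32, 16384});
    std::printf("%s\n", it != sizes.end() ? it->second.c_str() : "?B"); // -> 6.9B
}
```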
@@ -4383,7 +4569,11 @@ static void llm_load_vocab(
 tokenizer_pre == "starcoder") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
 } else if (
-tokenizer_pre == "gpt-2"
+tokenizer_pre == "gpt-2" ||
+tokenizer_pre == "jina-es" ||
+tokenizer_pre == "jina-de" ||
+tokenizer_pre == "jina-v2-es" ||
+tokenizer_pre == "jina-v2-de") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
 } else if (
 tokenizer_pre == "refact") {
@@ -4394,6 +4584,9 @@ static void llm_load_vocab(
 } else if (
 tokenizer_pre == "qwen2") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+} else if (
+tokenizer_pre == "stablelm2") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
 } else if (
 tokenizer_pre == "olmo") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -4515,7 +4708,8 @@ static void llm_load_vocab(
 (t.first == "<|eot_id|>" ||
 t.first == "<|im_end|>" ||
 t.first == "<|end|>" ||
-t.first == "<end_of_turn>"
+t.first == "<end_of_turn>" ||
+t.first == "<|endoftext|>"
 )
 ) {
 vocab.special_eot_id = t.second;
@@ -4743,13 +4937,13 @@ static bool llm_load_tensors(
 
 if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
 // calculate the split points
-int device_count = llama_get_device_count();
+int device_count = llama_get_device_count(model);
 bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
 std::vector<float> splits(device_count);
 if (all_zero) {
 // default split, by free memory
 for (int i = 0; i < device_count; ++i) {
-splits[i] = llama_get_device_memory(i);
+splits[i] = llama_get_device_memory(model, i);
 }
 } else {
 std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4963,35 @@ static bool llm_load_tensors(
 int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
 for (int64_t i = i_gpu_start; i < n_layer; ++i) {
 int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
 }
 // assign the output layer
 if (n_gpu_layers > n_layer) {
 int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
 } else {
 model.buft_output = llama_default_buffer_type_cpu(true);
 }
 } else {
 ggml_backend_buffer_type_t split_buft;
 if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
 } else {
 // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-split_buft = llama_default_buffer_type_offload(main_gpu);
+split_buft = llama_default_buffer_type_offload(model, main_gpu);
 }
 // assign the repeating layers
 for (int64_t i = i_gpu_start; i < n_layer; ++i) {
 model.buft_layer[i] = {
 split_buft,
-llama_default_buffer_type_offload(main_gpu)
+llama_default_buffer_type_offload(model, main_gpu)
 };
 }
 // assign the output layer
 if (n_gpu_layers > n_layer) {
 model.buft_output = {
 split_buft,
-llama_default_buffer_type_offload(main_gpu)
+llama_default_buffer_type_offload(model, main_gpu)
 };
 } else {
 model.buft_output = llama_default_buffer_type_cpu(true);
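In the LAYER split path above, per-device free memory is turned into cumulative fractions and each layer index is then mapped to a device with `std::upper_bound`. A standalone sketch of that normalization and lookup (the memory numbers are made up for illustration):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // free memory per device (bytes); stand-in values
    std::vector<float> splits = { 8e9f, 16e9f, 8e9f };

    // turn into a cumulative, normalized prefix sum: {0.25, 0.75, 1.0}
    float sum = 0.0f;
    for (float & s : splits) { sum += s; s = sum; }
    for (float & s : splits) { s /= sum; }

    const int n_layers = 32;
    for (int i = 0; i < n_layers; ++i) {
        // first device whose cumulative share exceeds this layer's position
        int dev = (int)(std::upper_bound(splits.begin(), splits.end(), float(i)/n_layers) - splits.begin());
        if (i % 8 == 0) {
            std::printf("layer %2d -> device %d\n", i, dev);
        }
    }
}
```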
@@ -4841,6 +5035,7 @@ static bool llm_load_tensors(
|
|
4841
5035
|
// create tensors for the weights
|
4842
5036
|
{
|
4843
5037
|
const int64_t n_embd = hparams.n_embd;
|
5038
|
+
const int64_t n_embd_head = n_embd / hparams.n_head;
|
4844
5039
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
4845
5040
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
4846
5041
|
const int64_t n_embd_gqa = n_embd_v_gqa;
|
@@ -4875,12 +5070,10 @@ static bool llm_load_tensors(
|
|
4875
5070
|
{
|
4876
5071
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4877
5072
|
if (model.arch != LLM_ARCH_MINICPM){
|
4878
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5073
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4879
5074
|
// if output is NULL, init from the input tok embed
|
4880
5075
|
if (model.output == NULL) {
|
4881
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4882
|
-
ml.n_created--; // artificial tensor
|
4883
|
-
ml.size_data += ggml_nbytes(model.output);
|
5076
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
4884
5077
|
}
|
4885
5078
|
}
|
4886
5079
|
}
|
@@ -4899,10 +5092,10 @@ static bool llm_load_tensors(
|
|
4899
5092
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4900
5093
|
|
4901
5094
|
// optional bias tensors
|
4902
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
4903
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
4904
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
4905
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
|
5095
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5096
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5097
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5098
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4906
5099
|
|
4907
5100
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4908
5101
|
|
@@ -4913,7 +5106,7 @@ static bool llm_load_tensors(
|
|
4913
5106
|
} else {
|
4914
5107
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4915
5108
|
|
4916
|
-
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
|
5109
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4917
5110
|
if (layer.ffn_gate_exps) {
|
4918
5111
|
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4919
5112
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
@@ -4955,12 +5148,10 @@ static bool llm_load_tensors(
|
|
4955
5148
|
// output
|
4956
5149
|
{
|
4957
5150
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4958
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5151
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4959
5152
|
// if output is NULL, init from the input tok embed
|
4960
5153
|
if (model.output == NULL) {
|
4961
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4962
|
-
ml.n_created--; // artificial tensor
|
4963
|
-
ml.size_data += ggml_nbytes(model.output);
|
5154
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
4964
5155
|
}
|
4965
5156
|
}
|
4966
5157
|
|
@@ -4983,7 +5174,7 @@ static bool llm_load_tensors(
|
|
4983
5174
|
|
4984
5175
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4985
5176
|
|
4986
|
-
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
|
5177
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4987
5178
|
if (layer.ffn_gate_exps) {
|
4988
5179
|
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4989
5180
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
@@ -5085,11 +5276,9 @@ static bool llm_load_tensors(
|
|
5085
5276
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5086
5277
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5087
5278
|
|
5088
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5279
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5089
5280
|
if (!model.output) {
|
5090
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
5091
|
-
ml.n_created--; // artificial tensor
|
5092
|
-
ml.size_data += ggml_nbytes(model.output);
|
5281
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
|
5093
5282
|
}
|
5094
5283
|
}
|
5095
5284
|
|
@@ -5102,8 +5291,8 @@ static bool llm_load_tensors(
|
|
5102
5291
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5103
5292
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
5104
5293
|
|
5105
|
-
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd},
|
5106
|
-
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd},
|
5294
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5295
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5107
5296
|
|
5108
5297
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
5109
5298
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
@@ -5121,7 +5310,12 @@ static bool llm_load_tensors(
|
|
5121
5310
|
{
|
5122
5311
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5123
5312
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5124
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5313
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5314
|
+
if (!model.output) {
|
5315
|
+
// needs to be on GPU
|
5316
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5317
|
+
}
|
5318
|
+
|
5125
5319
|
}
|
5126
5320
|
|
5127
5321
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -5149,47 +5343,6 @@ static bool llm_load_tensors(
|
|
5149
5343
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
5150
5344
|
}
|
5151
5345
|
} break;
|
5152
|
-
case LLM_ARCH_PERSIMMON:
|
5153
|
-
{
|
5154
|
-
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5155
|
-
|
5156
|
-
{
|
5157
|
-
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5158
|
-
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5159
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5160
|
-
}
|
5161
|
-
|
5162
|
-
for (int i = 0; i < n_layer; ++i) {
|
5163
|
-
ggml_context * ctx_layer = ctx_for_layer(i);
|
5164
|
-
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5165
|
-
|
5166
|
-
auto & layer = model.layers[i];
|
5167
|
-
|
5168
|
-
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5169
|
-
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
5170
|
-
|
5171
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
5172
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
5173
|
-
|
5174
|
-
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5175
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
5176
|
-
|
5177
|
-
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
5178
|
-
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
5179
|
-
|
5180
|
-
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5181
|
-
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
5182
|
-
|
5183
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5184
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
5185
|
-
|
5186
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
|
5187
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
|
5188
|
-
|
5189
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
|
5190
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
5191
|
-
}
|
5192
|
-
} break;
|
5193
5346
|
case LLM_ARCH_BERT:
|
5194
5347
|
case LLM_ARCH_NOMIC_BERT:
|
5195
5348
|
{
|
@@ -5242,6 +5395,50 @@ static bool llm_load_tensors(
|
|
5242
5395
|
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
5243
5396
|
}
|
5244
5397
|
} break;
|
5398
|
+
case LLM_ARCH_JINA_BERT_V2:
|
5399
|
+
{
|
5400
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
|
5401
|
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
|
5402
|
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
|
5403
|
+
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
|
5404
|
+
|
5405
|
+
for (int i = 0; i < n_layer; ++i) {
|
5406
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5407
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5408
|
+
|
5409
|
+
auto & layer = model.layers[i]; // JinaBertLayer
|
5410
|
+
|
5411
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5412
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
5413
|
+
|
5414
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5415
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5416
|
+
|
5417
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5418
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
5419
|
+
|
5420
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5421
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5422
|
+
|
5423
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5424
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
5425
|
+
|
5426
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
|
5427
|
+
layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
|
5428
|
+
|
5429
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
|
5430
|
+
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
5431
|
+
|
5432
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5433
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5434
|
+
|
5435
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
5436
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
5437
|
+
|
5438
|
+
layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
5439
|
+
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
5440
|
+
}
|
5441
|
+
} break;
|
5245
5442
|
case LLM_ARCH_BLOOM:
|
5246
5443
|
{
|
5247
5444
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
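The new LLM_ARCH_JINA_BERT_V2 loader above requests the per-layer Q/K norm weights with llama_model_loader::TENSOR_NOT_REQUIRED, so a missing tensor comes back as null instead of failing the load. Below is a minimal standalone sketch of that "optional tensor" pattern; the Tensor/Loader types and the flag name are stand-ins for the real llama_model_loader machinery, not its actual API.

// Sketch only: Tensor, Loader and the flag are illustrative stand-ins.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

struct Tensor { std::string name; };

enum TensorFlags { TENSOR_REQUIRED = 0, TENSOR_NOT_REQUIRED = 1 };

struct Loader {
    std::map<std::string, Tensor> tensors;

    // Returns the named tensor, or nullptr when it is absent and the caller
    // marked it as not required; throws for a missing required tensor.
    Tensor * create_tensor(const std::string & name, int flags = TENSOR_REQUIRED) {
        auto it = tensors.find(name);
        if (it == tensors.end()) {
            if (flags & TENSOR_NOT_REQUIRED) {
                return nullptr;
            }
            throw std::runtime_error("missing required tensor: " + name);
        }
        return &it->second;
    }
};

int main() {
    Loader ml;
    ml.tensors["attn_q.weight"] = {"attn_q.weight"};

    Tensor * wq     = ml.create_tensor("attn_q.weight");                            // must exist
    Tensor * q_norm = ml.create_tensor("attn_q_norm.weight", TENSOR_NOT_REQUIRED);  // may be null

    std::printf("wq loaded, optional q_norm %s\n", q_norm ? "present" : "absent");
    return 0;
}

The graph builder later checks the pointer (for example `if (model.layers[il].attn_q_norm)`) before applying the extra norm, which is why loading can tolerate the absence.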
@@ -5283,18 +5480,16 @@ static bool llm_load_tensors(
|
|
5283
5480
|
case LLM_ARCH_MPT:
|
5284
5481
|
{
|
5285
5482
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5286
|
-
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train},
|
5483
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5287
5484
|
|
5288
5485
|
// output
|
5289
5486
|
{
|
5290
5487
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5291
|
-
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd},
|
5488
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5292
5489
|
|
5293
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5490
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5294
5491
|
if (!model.output) {
|
5295
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
5296
|
-
ml.n_created--; // artificial tensor
|
5297
|
-
ml.size_data += ggml_nbytes(model.output);
|
5492
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
|
5298
5493
|
}
|
5299
5494
|
}
|
5300
5495
|
|
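In the MPT hunk above, the manual `ml.n_created--` / `ml.size_data += ggml_nbytes(...)` bookkeeping is replaced by a TENSOR_DUPLICATED flag when the output head falls back to the token-embedding matrix. A hedged sketch of that tied-weights fallback follows; the Weights/Model types and tensor names are toy stand-ins, not the real loader API.

// Toy illustration of "tied" output weights: when no dedicated output matrix
// is present, reuse the token embedding matrix and remember that it is shared
// so it is not counted (or freed) twice.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Weights {
    std::map<std::string, std::vector<float>> mats;
};

struct Model {
    const std::vector<float> * output      = nullptr;
    bool                       output_tied = false; // plays the role of TENSOR_DUPLICATED
};

void load_output(Model & m, const Weights & w) {
    auto it = w.mats.find("output.weight");
    if (it != w.mats.end()) {
        m.output = &it->second;
        return;
    }
    // fall back to the input embedding matrix, flagged as a duplicate
    m.output      = &w.mats.at("token_embd.weight");
    m.output_tied = true;
}

int main() {
    Weights w;
    w.mats["token_embd.weight"] = std::vector<float>(16, 0.0f);
    Model m;
    load_output(m, w);
    std::cout << (m.output_tied ? "output tied to token_embd\n" : "dedicated output\n");
    return 0;
}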
@@ -5305,31 +5500,31 @@ static bool llm_load_tensors(
|
|
5305
5500
|
auto & layer = model.layers[i];
|
5306
5501
|
|
5307
5502
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5308
|
-
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd},
|
5503
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5309
5504
|
|
5310
5505
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
5311
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
5506
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5312
5507
|
|
5313
5508
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5314
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
|
5509
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5315
5510
|
|
5316
5511
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5317
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
5512
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5318
5513
|
|
5319
5514
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
5320
|
-
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd},
|
5515
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5321
5516
|
|
5322
5517
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5323
|
-
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff},
|
5518
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5324
5519
|
|
5325
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
|
5326
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
|
5520
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5521
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5327
5522
|
|
5328
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
|
5329
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
|
5523
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5524
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5330
5525
|
|
5331
5526
|
// AWQ ScaleActivation layer
|
5332
|
-
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff},
|
5527
|
+
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5333
5528
|
}
|
5334
5529
|
} break;
|
5335
5530
|
case LLM_ARCH_STABLELM:
|
@@ -5358,17 +5553,17 @@ static bool llm_load_tensors(
|
|
5358
5553
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5359
5554
|
|
5360
5555
|
// optional bias tensors, present in Stable LM 2 1.6B
|
5361
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
5362
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
5363
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
5556
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5557
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5558
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5364
5559
|
|
5365
5560
|
// optional q and k layernorms, present in StableLM 2 12B
|
5366
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},
|
5367
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv},
|
5561
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5562
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5368
5563
|
|
5369
5564
|
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
5370
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},
|
5371
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
5565
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5566
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5372
5567
|
|
5373
5568
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5374
5569
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
@@ -5411,12 +5606,10 @@ static bool llm_load_tensors(
|
|
5411
5606
|
// output
|
5412
5607
|
{
|
5413
5608
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5414
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5609
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5415
5610
|
// if output is NULL, init from the input tok embed
|
5416
5611
|
if (model.output == NULL) {
|
5417
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5418
|
-
ml.n_created--; // artificial tensor
|
5419
|
-
ml.size_data += ggml_nbytes(model.output);
|
5612
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5420
5613
|
}
|
5421
5614
|
}
|
5422
5615
|
|
@@ -5514,8 +5707,8 @@ static bool llm_load_tensors(
|
|
5514
5707
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5515
5708
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
5516
5709
|
|
5517
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa},
|
5518
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
5710
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5711
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5519
5712
|
|
5520
5713
|
if (layer.wqkv == nullptr) {
|
5521
5714
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
@@ -5552,17 +5745,20 @@ static bool llm_load_tensors(
|
|
5552
5745
|
ggml_context* ctx_layer = ctx_for_layer(i);
|
5553
5746
|
ggml_context* ctx_split = ctx_for_layer_split(i);
|
5554
5747
|
|
5555
|
-
auto& layer = model.layers[i];
|
5748
|
+
auto & layer = model.layers[i];
|
5556
5749
|
|
5557
5750
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
5558
5751
|
|
5559
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa },
|
5560
|
-
layer.wo
|
5752
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5753
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
5561
5754
|
|
5562
5755
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
5563
5756
|
|
5564
5757
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
5565
5758
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
5759
|
+
|
5760
|
+
layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
5761
|
+
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
5566
5762
|
}
|
5567
5763
|
} break;
|
5568
5764
|
case LLM_ARCH_PLAMO:
|
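The rope_long/rope_short lines above request one frequency-factor vector of length n_embd_head/2 per layer, but mark every layer after the first with TENSOR_DUPLICATED, so in effect all layers share a single buffer. A toy sketch of that sharing is below; the Layer struct and ownership flag are illustrative, not the real loader types.

// Sketch: the same frequency-factor buffer is handed to every layer; only the
// first layer "owns" it, the rest are recorded as duplicates so the data is
// neither counted nor released more than once.
#include <cstdio>
#include <vector>

struct Layer {
    const std::vector<float> * rope_long = nullptr;
    bool duplicated = false;   // plays the role of TENSOR_DUPLICATED
};

int main() {
    const int n_layer = 4;
    std::vector<float> long_factors = {1.0f, 1.0f, 2.0f, 4.0f};  // n_embd_head/2 entries

    std::vector<Layer> layers(n_layer);
    for (int i = 0; i < n_layer; ++i) {
        layers[i].rope_long  = &long_factors;   // every layer sees the same data
        layers[i].duplicated = (i != 0);        // only layer 0 counts as the owner
    }

    std::printf("layer 3 shares layer 0's factors: %s\n",
                layers[3].rope_long == layers[0].rope_long ? "yes" : "no");
    return 0;
}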
@@ -5731,9 +5927,7 @@ static bool llm_load_tensors(
|
|
5731
5927
|
|
5732
5928
|
// output
|
5733
5929
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5734
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
5735
|
-
ml.n_created--; // artificial tensor
|
5736
|
-
ml.size_data += ggml_nbytes(model.output);
|
5930
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
5737
5931
|
|
5738
5932
|
const int64_t n_ff = hparams.n_ff;
|
5739
5933
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
@@ -5768,12 +5962,10 @@ static bool llm_load_tensors(
|
|
5768
5962
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5769
5963
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5770
5964
|
|
5771
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5965
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5772
5966
|
// if output is NULL, init from the input tok embed
|
5773
5967
|
if (model.output == NULL) {
|
5774
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5775
|
-
ml.n_created--; // artificial tensor
|
5776
|
-
ml.size_data += ggml_nbytes(model.output);
|
5968
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5777
5969
|
}
|
5778
5970
|
|
5779
5971
|
}
|
@@ -5824,12 +6016,10 @@ static bool llm_load_tensors(
|
|
5824
6016
|
{
|
5825
6017
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5826
6018
|
|
5827
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6019
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5828
6020
|
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
5829
6021
|
if (model.output == NULL) {
|
5830
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5831
|
-
ml.n_created--; // artificial tensor
|
5832
|
-
ml.size_data += ggml_nbytes(model.output);
|
6022
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5833
6023
|
}
|
5834
6024
|
}
|
5835
6025
|
|
@@ -5890,9 +6080,7 @@ static bool llm_load_tensors(
|
|
5890
6080
|
{
|
5891
6081
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5892
6082
|
// init output from the input tok embed
|
5893
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5894
|
-
ml.n_created--; // artificial tensor
|
5895
|
-
ml.size_data += ggml_nbytes(model.output);
|
6083
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5896
6084
|
}
|
5897
6085
|
|
5898
6086
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -5924,12 +6112,10 @@ static bool llm_load_tensors(
|
|
5924
6112
|
|
5925
6113
|
// output
|
5926
6114
|
{
|
5927
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6115
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5928
6116
|
// if output is NULL, init from the input tok embed
|
5929
6117
|
if (model.output == NULL) {
|
5930
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5931
|
-
ml.n_created--; // artificial tensor
|
5932
|
-
ml.size_data += ggml_nbytes(model.output);
|
6118
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5933
6119
|
}
|
5934
6120
|
}
|
5935
6121
|
|
@@ -5949,6 +6135,81 @@ static bool llm_load_tensors(
|
|
5949
6135
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5950
6136
|
}
|
5951
6137
|
} break;
|
6138
|
+
case LLM_ARCH_GPTNEOX:
|
6139
|
+
{
|
6140
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6141
|
+
// output
|
6142
|
+
{
|
6143
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6144
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
6145
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
6146
|
+
}
|
6147
|
+
|
6148
|
+
for (int i = 0; i < n_layer; ++i) {
|
6149
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6150
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6151
|
+
|
6152
|
+
auto & layer = model.layers[i];
|
6153
|
+
|
6154
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6155
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
6156
|
+
|
6157
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
6158
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
6159
|
+
|
6160
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
6161
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
6162
|
+
|
6163
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6164
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
6165
|
+
|
6166
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
6167
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
6168
|
+
|
6169
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6170
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
6171
|
+
}
|
6172
|
+
} break;
|
6173
|
+
case LLM_ARCH_ARCTIC:
|
6174
|
+
{
|
6175
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6176
|
+
|
6177
|
+
// output
|
6178
|
+
{
|
6179
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6180
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
6181
|
+
// if output is NULL, init from the input tok embed
|
6182
|
+
if (model.output == NULL) {
|
6183
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
6184
|
+
}
|
6185
|
+
}
|
6186
|
+
|
6187
|
+
for (int i = 0; i < n_layer; ++i) {
|
6188
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6189
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6190
|
+
|
6191
|
+
auto & layer = model.layers[i];
|
6192
|
+
|
6193
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6194
|
+
|
6195
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
6196
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
6197
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
6198
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
6199
|
+
|
6200
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6201
|
+
|
6202
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
|
6203
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
|
6204
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
|
6205
|
+
|
6206
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
6207
|
+
layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
|
6208
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
6209
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
6210
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
6211
|
+
}
|
6212
|
+
} break;
|
5952
6213
|
default:
|
5953
6214
|
throw std::runtime_error("unknown architecture");
|
5954
6215
|
}
|
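The new LLM_ARCH_ARCTIC branch above stacks all experts' FFN weights into single 3-D tensors of shape {n_embd, n_ff, n_expert}, with a router matrix ffn_gate_inp of shape {n_embd, n_expert}. The following is a toy sketch of that routing idea, using top-1 selection over a stacked weight array; the real build uses ggml ops and top-k routing, so sizes and the selection rule here are simplifications.

// Minimal mixture-of-experts routing sketch: router scores pick one expert,
// whose weight slice from a stacked [n_expert][n_ff][n_embd] array is applied.
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4, n_ff = 6, n_expert = 3;

    std::vector<float> x(n_embd, 1.0f);                           // token activation
    std::vector<float> router(n_expert * n_embd, 0.1f);           // stands in for ffn_gate_inp
    std::vector<float> experts(n_expert * n_ff * n_embd, 0.01f);  // stacked "ffn_up" weights
    router[1 * n_embd + 0] = 1.0f;                                // bias expert 1 upward

    // router logits: one score per expert, pick the best one
    int best = 0; float best_score = -1e30f;
    for (int e = 0; e < n_expert; ++e) {
        float s = 0.0f;
        for (int i = 0; i < n_embd; ++i) s += router[e * n_embd + i] * x[i];
        if (s > best_score) { best_score = s; best = e; }
    }

    // apply only the selected expert's slice of the stacked tensor
    std::vector<float> y(n_ff, 0.0f);
    const float * w = &experts[best * n_ff * n_embd];
    for (int j = 0; j < n_ff; ++j)
        for (int i = 0; i < n_embd; ++i)
            y[j] += w[j * n_embd + i] * x[i];

    std::printf("routed token to expert %d, y[0] = %f\n", best, y[0]);
    return 0;
}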
@@ -6213,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
6213
6474
|
|
6214
6475
|
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
6215
6476
|
} else {
|
6216
|
-
|
6217
|
-
GGML_ASSERT(false && "not implemented");
|
6218
|
-
#endif
|
6219
|
-
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
6477
|
+
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
6220
6478
|
inpL = lctx.inp_embd;
|
6221
6479
|
ggml_set_input(lctx.inp_embd);
|
6222
6480
|
}
|
@@ -6318,7 +6576,7 @@ static struct ggml_tensor * llm_build_ffn(
|
|
6318
6576
|
llm_ffn_gate_type type_gate,
|
6319
6577
|
const llm_build_cb & cb,
|
6320
6578
|
int il) {
|
6321
|
-
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
|
6579
|
+
struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
|
6322
6580
|
cb(tmp, "ffn_up", il);
|
6323
6581
|
|
6324
6582
|
if (up_b) {
|
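The one-line change above (`up ? ggml_mul_mat(ctx, up, cur) : cur`) lets llm_build_ffn be called without an up-projection matrix, as the new Jina BERT v2 path does; the input then flows straight into the gate/activation stage. A condensed sketch of that null-guard pattern follows in plain C++; matvec and build_ffn are toy stand-ins for ggml_mul_mat and the real helper.

// Sketch of an FFN helper where each weight is optional; absent matrices are
// skipped rather than dereferenced.
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;

static Vec matvec(const Mat & W, const Vec & x) {
    Vec y(W.size(), 0.0f);
    for (size_t r = 0; r < W.size(); ++r)
        for (size_t c = 0; c < x.size(); ++c)
            y[r] += W[r][c] * x[c];
    return y;
}

// up/gate/down may be null, mirroring the optional tensors in the builder
Vec build_ffn(const Vec & cur, const Mat * up, const Mat * gate, const Mat * down) {
    Vec tmp = up ? matvec(*up, cur) : cur;     // the pattern from the diff
    if (gate) {
        Vec g = matvec(*gate, cur);
        for (size_t i = 0; i < tmp.size(); ++i) tmp[i] *= g[i]; // element-wise gating
    }
    return down ? matvec(*down, tmp) : tmp;
}

int main() {
    Vec x = {1.0f, 2.0f};
    Mat down = {{0.5f, 0.5f}};
    Vec y = build_ffn(x, /*up=*/nullptr, /*gate=*/nullptr, &down);
    return y.empty() ? 1 : 0;
}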
@@ -6500,7 +6758,6 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6500
6758
|
struct ggml_tensor * wo_b,
|
6501
6759
|
struct ggml_tensor * q_cur,
|
6502
6760
|
struct ggml_tensor * kq_mask,
|
6503
|
-
struct ggml_tensor * kq_pos,
|
6504
6761
|
int32_t n_tokens,
|
6505
6762
|
int32_t n_kv,
|
6506
6763
|
float kq_scale,
|
@@ -6512,6 +6769,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6512
6769
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
6513
6770
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
6514
6771
|
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
6772
|
+
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
6515
6773
|
|
6516
6774
|
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
6517
6775
|
cb(q, "q", il);
|
@@ -6530,31 +6788,27 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6530
6788
|
GGML_UNUSED(model);
|
6531
6789
|
GGML_UNUSED(n_ctx);
|
6532
6790
|
|
6533
|
-
// note: if this assert triggers, then some check has failed earlier
|
6534
|
-
// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
|
6535
|
-
GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
|
6536
|
-
|
6537
6791
|
// split cached v into n_head heads (not transposed)
|
6538
6792
|
struct ggml_tensor * v =
|
6539
6793
|
ggml_view_3d(ctx, kv.v_l[il],
|
6540
6794
|
n_embd_head_v, n_kv, n_head_kv,
|
6541
|
-
ggml_row_size(kv.v_l[il]->type,
|
6542
|
-
ggml_row_size(kv.v_l[il]->type,
|
6795
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
|
6796
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
|
6543
6797
|
0);
|
6544
6798
|
cb(v, "v", il);
|
6545
6799
|
|
6546
|
-
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
|
6800
|
+
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
6547
6801
|
|
6548
|
-
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6802
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
6549
6803
|
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
6550
6804
|
}
|
6551
6805
|
|
6552
|
-
cur = ggml_reshape_2d(ctx, cur,
|
6806
|
+
cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
|
6553
6807
|
} else {
|
6554
6808
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
6555
6809
|
cb(kq, "kq", il);
|
6556
6810
|
|
6557
|
-
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6811
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
6558
6812
|
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
6559
6813
|
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
6560
6814
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
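In the flash-attention branch rewritten above, the cached V is viewed head-major without transposition (row stride n_embd_v_gqa, head stride n_embd_head_v) and the maximum ALiBi bias is handed to ggml_flash_attn_ext directly instead of being rejected with an assert. As a rough aid, here is a toy sketch of how a per-head ALiBi slope turns into an additive penalty on attention scores; the fused kernel does this internally, and the slope schedule shown is just one common choice for power-of-two head counts.

// ALiBi sketch: head h gets slope 2^(-max_bias*(h+1)/n_head), and the score
// for key position j is penalised by slope * (i - j). Values illustrative.
#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 4;
    const float max_bias = 8.0f;   // corresponds to hparams.f_max_alibi_bias in the diff

    const float m0 = std::pow(2.0f, -max_bias / n_head);

    const int i = 7;               // query position
    for (int h = 0; h < n_head; ++h) {
        const float slope = std::pow(m0, float(h + 1));
        const float bias  = -slope * float(i - 0);  // penalty for the most distant key (j = 0)
        std::printf("head %d: slope %.4f, bias at distance %d = %.4f\n", h, slope, i, bias);
    }
    return 0;
}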
@@ -6574,28 +6828,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6574
6828
|
kq = ggml_scale(ctx, kq, 30);
|
6575
6829
|
}
|
6576
6830
|
|
6577
|
-
|
6578
|
-
|
6579
|
-
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
6580
|
-
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
6581
|
-
if (hparams.use_alibi) {
|
6582
|
-
kq = ggml_scale(ctx, kq, kq_scale);
|
6583
|
-
cb(kq, "kq_scaled", il);
|
6584
|
-
|
6585
|
-
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
|
6586
|
-
cb(kq, "kq_scaled_alibi", il);
|
6587
|
-
|
6588
|
-
kq = ggml_add(ctx, kq, kq_mask);
|
6589
|
-
cb(kq, "kq_masked", il);
|
6590
|
-
|
6591
|
-
kq = ggml_soft_max(ctx, kq);
|
6592
|
-
cb(kq, "kq_soft_max", il);
|
6593
|
-
} else
|
6594
|
-
#endif
|
6595
|
-
{
|
6596
|
-
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
|
6597
|
-
cb(kq, "kq_soft_max_ext", il);
|
6598
|
-
}
|
6831
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
6832
|
+
cb(kq, "kq_soft_max_ext", il);
|
6599
6833
|
|
6600
6834
|
GGML_ASSERT(kv.size == n_ctx);
|
6601
6835
|
|
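The hunk above deletes the old `ggml_alibi()` fallback together with the separate kq_pos input: scaling, masking, ALiBi and the softmax are now all folded into the single ggml_soft_max_ext call. For reference, this is an unfused sketch of what that call computes for one attention row under those assumptions (toy sizes, ALiBi term omitted since it is covered in the slope sketch earlier).

// Reference (unfused) version of softmax(score * scale + mask) for one row,
// with -inf mask entries removing masked-out positions.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
    const float scale = 1.0f / std::sqrt(64.0f);            // 1/sqrt(n_embd_head)
    std::vector<float> kq   = {2.0f, 1.0f, 0.5f, 3.0f};      // raw QK^T row
    std::vector<float> mask = {0.0f, 0.0f, 0.0f,
                               -std::numeric_limits<float>::infinity()}; // causal mask

    std::vector<float> p(kq.size());
    float mx = -std::numeric_limits<float>::infinity();
    for (size_t j = 0; j < kq.size(); ++j) {
        p[j] = kq[j] * scale + mask[j];
        mx   = std::max(mx, p[j]);
    }
    float sum = 0.0f;
    for (float & v : p) { v = std::exp(v - mx); sum += v; }
    for (float & v : p) v /= sum;

    for (size_t j = 0; j < p.size(); ++j) std::printf("p[%zu] = %f\n", j, p[j]);
    return 0;
}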
@@ -6614,7 +6848,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6614
6848
|
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
6615
6849
|
cb(kqv_merged, "kqv_merged", il);
|
6616
6850
|
|
6617
|
-
cur = ggml_cont_2d(ctx, kqv_merged,
|
6851
|
+
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
|
6618
6852
|
cb(cur, "kqv_merged_cont", il);
|
6619
6853
|
}
|
6620
6854
|
|
@@ -6645,7 +6879,6 @@ static struct ggml_tensor * llm_build_kv(
|
|
6645
6879
|
struct ggml_tensor * v_cur,
|
6646
6880
|
struct ggml_tensor * q_cur,
|
6647
6881
|
struct ggml_tensor * kq_mask,
|
6648
|
-
struct ggml_tensor * kq_pos,
|
6649
6882
|
int32_t n_tokens,
|
6650
6883
|
int32_t kv_head,
|
6651
6884
|
int32_t n_kv,
|
@@ -6664,7 +6897,7 @@ static struct ggml_tensor * llm_build_kv(
|
|
6664
6897
|
struct ggml_tensor * cur;
|
6665
6898
|
|
6666
6899
|
cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
|
6667
|
-
q_cur, kq_mask,
|
6900
|
+
q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
|
6668
6901
|
cb(cur, "kqv_out", il);
|
6669
6902
|
|
6670
6903
|
return cur;
|
@@ -6771,18 +7004,17 @@ struct llm_build_context {
|
|
6771
7004
|
|
6772
7005
|
ctx0 = ggml_init(params);
|
6773
7006
|
|
6774
|
-
lctx.inp_tokens
|
6775
|
-
lctx.inp_embd
|
6776
|
-
lctx.inp_pos
|
7007
|
+
lctx.inp_tokens = nullptr;
|
7008
|
+
lctx.inp_embd = nullptr;
|
7009
|
+
lctx.inp_pos = nullptr;
|
6777
7010
|
lctx.inp_out_ids = nullptr;
|
6778
7011
|
lctx.inp_KQ_mask = nullptr;
|
6779
|
-
lctx.inp_KQ_pos = nullptr;
|
6780
7012
|
lctx.inp_K_shift = nullptr;
|
6781
|
-
lctx.inp_mean
|
6782
|
-
lctx.inp_cls
|
6783
|
-
lctx.inp_s_copy
|
6784
|
-
lctx.inp_s_mask
|
6785
|
-
lctx.inp_s_seq
|
7013
|
+
lctx.inp_mean = nullptr;
|
7014
|
+
lctx.inp_cls = nullptr;
|
7015
|
+
lctx.inp_s_copy = nullptr;
|
7016
|
+
lctx.inp_s_mask = nullptr;
|
7017
|
+
lctx.inp_s_seq = nullptr;
|
6786
7018
|
}
|
6787
7019
|
|
6788
7020
|
void free() {
|
@@ -6801,17 +7033,20 @@ struct llm_build_context {
|
|
6801
7033
|
cb(lctx.inp_K_shift, "K_shift", -1);
|
6802
7034
|
ggml_set_input(lctx.inp_K_shift);
|
6803
7035
|
|
7036
|
+
|
6804
7037
|
for (int il = 0; il < n_layer; ++il) {
|
7038
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
6805
7039
|
struct ggml_tensor * tmp =
|
6806
7040
|
// we rotate only the first n_rot dimensions
|
6807
|
-
|
7041
|
+
ggml_rope_ext_inplace(ctx0,
|
6808
7042
|
ggml_view_3d(ctx0, kv_self.k_l[il],
|
6809
7043
|
n_embd_head_k, n_head_kv, n_ctx,
|
6810
7044
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
6811
7045
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
6812
7046
|
0),
|
6813
|
-
lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7047
|
+
lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6814
7048
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7049
|
+
|
6815
7050
|
cb(tmp, "K_shifted", il);
|
6816
7051
|
ggml_build_forward_expand(gf, tmp);
|
6817
7052
|
}
|
@@ -6914,6 +7149,17 @@ struct llm_build_context {
|
|
6914
7149
|
return lctx.inp_pos;
|
6915
7150
|
}
|
6916
7151
|
|
7152
|
+
struct ggml_tensor * build_rope_factors(int il) {
|
7153
|
+
// choose long/short freq factors based on the context size
|
7154
|
+
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
7155
|
+
|
7156
|
+
if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
|
7157
|
+
return model.layers[il].rope_long;
|
7158
|
+
}
|
7159
|
+
|
7160
|
+
return model.layers[il].rope_short;
|
7161
|
+
}
|
7162
|
+
|
6917
7163
|
struct ggml_tensor * build_inp_out_ids() {
|
6918
7164
|
lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
6919
7165
|
cb(lctx.inp_out_ids, "inp_out_ids", -1);
|
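build_rope_factors() in the hunk above picks between the long and short frequency-factor tensors by comparing the per-sequence context size (n_ctx / n_seq_max) against the model's original training context. A self-contained sketch of that decision follows; the field names mirror the diff, but the structs are stand-ins for llama's cparams/hparams/layer types.

// Sketch of the long/short RoPE factor selection from build_rope_factors().
#include <cstdio>

struct CParams { unsigned n_ctx; unsigned n_seq_max; };
struct HParams { unsigned n_yarn_orig_ctx; };
struct Layer   { const float * rope_long; const float * rope_short; };

const float * rope_factors(const CParams & c, const HParams & h, const Layer & l) {
    const unsigned n_ctx_per_seq = c.n_ctx / c.n_seq_max;
    // beyond the original training context -> use the "long" factors
    return (n_ctx_per_seq > h.n_yarn_orig_ctx) ? l.rope_long : l.rope_short;
}

int main() {
    static const float longf[]  = {2.0f};
    static const float shortf[] = {1.0f};
    Layer layer = {longf, shortf};

    CParams c = {32768, 1};
    HParams h = {4096};
    std::printf("selected %s factors\n",
                rope_factors(c, h, layer) == longf ? "long" : "short");
    return 0;
}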
@@ -6932,19 +7178,6 @@ struct llm_build_context {
|
|
6932
7178
|
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
|
6933
7179
|
}
|
6934
7180
|
|
6935
|
-
struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
|
6936
|
-
if (causal) {
|
6937
|
-
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
|
6938
|
-
} else {
|
6939
|
-
// TODO: this will be needed for ALiBi-based BERT models
|
6940
|
-
// https://github.com/ggerganov/llama.cpp/pull/6826
|
6941
|
-
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
|
6942
|
-
}
|
6943
|
-
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
|
6944
|
-
ggml_set_input(lctx.inp_KQ_pos);
|
6945
|
-
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
|
6946
|
-
}
|
6947
|
-
|
6948
7181
|
struct ggml_tensor * build_inp_mean() {
|
6949
7182
|
lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
6950
7183
|
cb(lctx.inp_mean, "inp_mean", -1);
|
@@ -7034,15 +7267,15 @@ struct llm_build_context {
|
|
7034
7267
|
cb(Vcur, "Vcur", il);
|
7035
7268
|
}
|
7036
7269
|
|
7037
|
-
Qcur =
|
7038
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7270
|
+
Qcur = ggml_rope_ext(
|
7271
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7039
7272
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7040
7273
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7041
7274
|
);
|
7042
7275
|
cb(Qcur, "Qcur", il);
|
7043
7276
|
|
7044
|
-
Kcur =
|
7045
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7277
|
+
Kcur = ggml_rope_ext(
|
7278
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7046
7279
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7047
7280
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7048
7281
|
);
|
@@ -7050,7 +7283,7 @@ struct llm_build_context {
|
|
7050
7283
|
|
7051
7284
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7052
7285
|
model.layers[il].wo, model.layers[il].bo,
|
7053
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7286
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7054
7287
|
}
|
7055
7288
|
|
7056
7289
|
if (il == n_layer - 1) {
|
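The call sites above switch to ggml_rope_ext, which takes an extra tensor of frequency factors right after the positions; the builders pass nullptr when no long/short factors apply, while the K-shift path earlier feeds it the result of build_rope_factors(). Below is a toy analogue of an API extended the same way, where a null factor pointer means "plain RoPE"; the function is illustrative and not the real ggml signature.

// Toy rotation helper with an optional per-dimension factor vector.
#include <cmath>
#include <cstdio>

// rotate one (x, y) pair by the RoPE angle for dimension-pair i
void rope_pair(float & x, float & y, int pos, int i, int head_dim,
               float freq_base, const float * freq_factors /* may be null */) {
    float theta = pos / std::pow(freq_base, (2.0f * i) / head_dim);
    if (freq_factors) {
        theta /= freq_factors[i];   // factors slow selected dimensions down
    }
    const float c = std::cos(theta), s = std::sin(theta);
    const float nx = x * c - y * s;
    const float ny = x * s + y * c;
    x = nx; y = ny;
}

int main() {
    float x = 1.0f, y = 0.0f;
    rope_pair(x, y, /*pos=*/5, /*i=*/0, /*head_dim=*/8, 10000.0f, nullptr);
    std::printf("rotated: %f %f\n", x, y);
    return 0;
}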
@@ -7143,9 +7376,6 @@ struct llm_build_context {
|
|
7143
7376
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7144
7377
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7145
7378
|
|
7146
|
-
// positions of the tokens in the KV cache
|
7147
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
7148
|
-
|
7149
7379
|
for (int il = 0; il < n_layer; ++il) {
|
7150
7380
|
struct ggml_tensor * inpSA = inpL;
|
7151
7381
|
|
@@ -7167,13 +7397,13 @@ struct llm_build_context {
|
|
7167
7397
|
|
7168
7398
|
switch (model.type) {
|
7169
7399
|
case MODEL_7B:
|
7170
|
-
Qcur =
|
7171
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7400
|
+
Qcur = ggml_rope_ext(
|
7401
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7172
7402
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7173
7403
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7174
7404
|
);
|
7175
|
-
Kcur =
|
7176
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7405
|
+
Kcur = ggml_rope_ext(
|
7406
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7177
7407
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7178
7408
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7179
7409
|
);
|
@@ -7190,7 +7420,7 @@ struct llm_build_context {
|
|
7190
7420
|
|
7191
7421
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7192
7422
|
model.layers[il].wo, NULL,
|
7193
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7423
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7194
7424
|
}
|
7195
7425
|
|
7196
7426
|
if (il == n_layer - 1) {
|
@@ -7260,9 +7490,6 @@ struct llm_build_context {
|
|
7260
7490
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7261
7491
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7262
7492
|
|
7263
|
-
// positions of the tokens in the KV cache
|
7264
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
7265
|
-
|
7266
7493
|
for (int il = 0; il < n_layer; ++il) {
|
7267
7494
|
struct ggml_tensor * inpSA = inpL;
|
7268
7495
|
|
@@ -7282,22 +7509,22 @@ struct llm_build_context {
|
|
7282
7509
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
7283
7510
|
cb(Vcur, "Vcur", il);
|
7284
7511
|
|
7285
|
-
Qcur =
|
7286
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7512
|
+
Qcur = ggml_rope_ext(
|
7513
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7287
7514
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7288
7515
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7289
7516
|
);
|
7290
7517
|
cb(Qcur, "Qcur", il);
|
7291
7518
|
|
7292
|
-
Kcur =
|
7293
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7519
|
+
Kcur = ggml_rope_ext(
|
7520
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7294
7521
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7295
7522
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7296
7523
|
);
|
7297
7524
|
cb(Kcur, "Kcur", il);
|
7298
7525
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7299
7526
|
model.layers[il].wo, NULL,
|
7300
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7527
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7301
7528
|
}
|
7302
7529
|
|
7303
7530
|
if (il == n_layer - 1) {
|
@@ -7403,21 +7630,21 @@ struct llm_build_context {
|
|
7403
7630
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7404
7631
|
|
7405
7632
|
// using mode = 2 for neox mode
|
7406
|
-
Qcur =
|
7407
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7633
|
+
Qcur = ggml_rope_ext(
|
7634
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
7408
7635
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7409
7636
|
);
|
7410
7637
|
cb(Qcur, "Qcur", il);
|
7411
7638
|
|
7412
|
-
Kcur =
|
7413
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7639
|
+
Kcur = ggml_rope_ext(
|
7640
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
7414
7641
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7415
7642
|
);
|
7416
7643
|
cb(Kcur, "Kcur", il);
|
7417
7644
|
|
7418
7645
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7419
7646
|
model.layers[il].wo, NULL,
|
7420
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7647
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7421
7648
|
}
|
7422
7649
|
|
7423
7650
|
if (il == n_layer - 1) {
|
@@ -7526,15 +7753,15 @@ struct llm_build_context {
|
|
7526
7753
|
cb(Vcur, "Vcur", il);
|
7527
7754
|
}
|
7528
7755
|
|
7529
|
-
Qcur =
|
7530
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7756
|
+
Qcur = ggml_rope_ext(
|
7757
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7531
7758
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7532
7759
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7533
7760
|
);
|
7534
7761
|
cb(Qcur, "Qcur", il);
|
7535
7762
|
|
7536
|
-
Kcur =
|
7537
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7763
|
+
Kcur = ggml_rope_ext(
|
7764
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7538
7765
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7539
7766
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7540
7767
|
);
|
@@ -7542,7 +7769,7 @@ struct llm_build_context {
|
|
7542
7769
|
|
7543
7770
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7544
7771
|
model.layers[il].wo, model.layers[il].bo,
|
7545
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7772
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7546
7773
|
}
|
7547
7774
|
|
7548
7775
|
if (il == n_layer - 1) {
|
@@ -7678,15 +7905,15 @@ struct llm_build_context {
|
|
7678
7905
|
cb(Kcur, "Kcur", il);
|
7679
7906
|
cb(Vcur, "Vcur", il);
|
7680
7907
|
|
7681
|
-
Qcur =
|
7682
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7908
|
+
Qcur = ggml_rope_ext(
|
7909
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7683
7910
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7684
7911
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7685
7912
|
);
|
7686
7913
|
cb(Qcur, "Qcur", il);
|
7687
7914
|
|
7688
|
-
Kcur =
|
7689
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7915
|
+
Kcur = ggml_rope_ext(
|
7916
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7690
7917
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7691
7918
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7692
7919
|
);
|
@@ -7694,7 +7921,7 @@ struct llm_build_context {
|
|
7694
7921
|
|
7695
7922
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7696
7923
|
model.layers[il].wo, NULL,
|
7697
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7924
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7698
7925
|
}
|
7699
7926
|
|
7700
7927
|
if (il == n_layer - 1) {
|
@@ -7806,7 +8033,7 @@ struct llm_build_context {
|
|
7806
8033
|
|
7807
8034
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7808
8035
|
model.layers[il].wo, model.layers[il].bo,
|
7809
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8036
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7810
8037
|
}
|
7811
8038
|
|
7812
8039
|
if (il == n_layer - 1) {
|
@@ -7855,266 +8082,56 @@ struct llm_build_context {
|
|
7855
8082
|
return gf;
|
7856
8083
|
}
|
7857
8084
|
|
7858
|
-
struct ggml_cgraph *
|
8085
|
+
struct ggml_cgraph * build_refact() {
|
7859
8086
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7860
8087
|
|
7861
8088
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7862
|
-
GGML_ASSERT(n_embd_head
|
7863
|
-
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
8089
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7864
8090
|
|
7865
8091
|
struct ggml_tensor * cur;
|
7866
8092
|
struct ggml_tensor * inpL;
|
7867
8093
|
|
7868
8094
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7869
8095
|
|
7870
|
-
// inp_pos - contains the positions
|
7871
|
-
struct ggml_tensor * inp_pos = build_inp_pos();
|
7872
|
-
|
7873
8096
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7874
8097
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7875
8098
|
|
7876
8099
|
for (int il = 0; il < n_layer; ++il) {
|
7877
|
-
struct ggml_tensor *
|
8100
|
+
struct ggml_tensor * inpSA = inpL;
|
7878
8101
|
|
7879
8102
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
7880
|
-
model.layers[il].attn_norm,
|
7881
|
-
|
7882
|
-
LLM_NORM, cb, il);
|
8103
|
+
model.layers[il].attn_norm, NULL,
|
8104
|
+
LLM_NORM_RMS, cb, il);
|
7883
8105
|
cb(cur, "attn_norm", il);
|
7884
8106
|
|
7885
|
-
// self
|
8107
|
+
// self-attention
|
7886
8108
|
{
|
7887
|
-
|
7888
|
-
cb(
|
8109
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
8110
|
+
cb(Qcur, "Qcur", il);
|
7889
8111
|
|
7890
|
-
|
7891
|
-
cb(
|
8112
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
8113
|
+
cb(Kcur, "Kcur", il);
|
7892
8114
|
|
7893
|
-
|
7894
|
-
|
8115
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
8116
|
+
cb(Vcur, "Vcur", il);
|
7895
8117
|
|
7896
|
-
|
7897
|
-
cb(
|
8118
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8119
|
+
cb(Kcur, "Kcur", il);
|
7898
8120
|
|
7899
|
-
|
7900
|
-
cb(
|
8121
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8122
|
+
cb(Qcur, "Qcur", il);
|
7901
8123
|
|
7902
|
-
|
7903
|
-
|
7904
|
-
|
7905
|
-
|
7906
|
-
0
|
7907
|
-
);
|
7908
|
-
cb(tmpq, "tmpq", il);
|
8124
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8125
|
+
model.layers[il].wo, NULL,
|
8126
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8127
|
+
}
|
7909
8128
|
|
7910
|
-
|
7911
|
-
|
7912
|
-
|
7913
|
-
|
7914
|
-
|
7915
|
-
|
7916
|
-
cb(tmpk, "tmpk", il);
|
7917
|
-
|
7918
|
-
// Q/K Layernorm
|
7919
|
-
tmpq = llm_build_norm(ctx0, tmpq, hparams,
|
7920
|
-
model.layers[il].attn_q_norm,
|
7921
|
-
model.layers[il].attn_q_norm_b,
|
7922
|
-
LLM_NORM, cb, il);
|
7923
|
-
cb(tmpq, "tmpq", il);
|
7924
|
-
|
7925
|
-
tmpk = llm_build_norm(ctx0, tmpk, hparams,
|
7926
|
-
model.layers[il].attn_k_norm,
|
7927
|
-
model.layers[il].attn_k_norm_b,
|
7928
|
-
LLM_NORM, cb, il);
|
7929
|
-
cb(tmpk, "tmpk", il);
|
7930
|
-
|
7931
|
-
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
7932
|
-
struct ggml_tensor * qrot = ggml_view_3d(
|
7933
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
7934
|
-
ggml_element_size(tmpq) * n_embd_head,
|
7935
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
7936
|
-
0
|
7937
|
-
);
|
7938
|
-
cb(qrot, "qrot", il);
|
7939
|
-
|
7940
|
-
struct ggml_tensor * krot = ggml_view_3d(
|
7941
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
7942
|
-
ggml_element_size(tmpk) * n_embd_head,
|
7943
|
-
ggml_element_size(tmpk) * n_embd_head * n_head,
|
7944
|
-
0
|
7945
|
-
);
|
7946
|
-
cb(krot, "krot", il);
|
7947
|
-
|
7948
|
-
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
7949
|
-
struct ggml_tensor * qpass = ggml_view_3d(
|
7950
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
7951
|
-
ggml_element_size(tmpq) * n_embd_head,
|
7952
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
7953
|
-
ggml_element_size(tmpq) * n_rot
|
7954
|
-
);
|
7955
|
-
cb(qpass, "qpass", il);
|
7956
|
-
|
7957
|
-
struct ggml_tensor * kpass = ggml_view_3d(
|
7958
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
7959
|
-
ggml_element_size(tmpk) * n_embd_head,
|
7960
|
-
ggml_element_size(tmpk) * n_embd_head * n_head,
|
7961
|
-
ggml_element_size(tmpk) * n_rot
|
7962
|
-
);
|
7963
|
-
cb(kpass, "kpass", il);
|
7964
|
-
|
7965
|
-
struct ggml_tensor * qrotated = ggml_rope_custom(
|
7966
|
-
ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7967
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7968
|
-
);
|
7969
|
-
cb(qrotated, "qrotated", il);
|
7970
|
-
|
7971
|
-
struct ggml_tensor * krotated = ggml_rope_custom(
|
7972
|
-
ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7973
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7974
|
-
);
|
7975
|
-
cb(krotated, "krotated", il);
|
7976
|
-
|
7977
|
-
// ggml currently only supports concatenation on dim=2
|
7978
|
-
// so we need to permute qrot, qpass, concat, then permute back.
|
7979
|
-
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
7980
|
-
cb(qrotated, "qrotated", il);
|
7981
|
-
|
7982
|
-
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
7983
|
-
cb(krotated, "krotated", il);
|
7984
|
-
|
7985
|
-
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
7986
|
-
cb(qpass, "qpass", il);
|
7987
|
-
|
7988
|
-
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
7989
|
-
cb(kpass, "kpass", il);
|
7990
|
-
|
7991
|
-
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
7992
|
-
cb(Qcur, "Qcur", il);
|
7993
|
-
|
7994
|
-
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
7995
|
-
cb(Kcur, "Kcur", il);
|
7996
|
-
|
7997
|
-
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
|
7998
|
-
cb(Q, "Q", il);
|
7999
|
-
|
8000
|
-
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
8001
|
-
cb(Kcur, "Kcur", il);
|
8002
|
-
|
8003
|
-
struct ggml_tensor * Vcur = ggml_view_3d(
|
8004
|
-
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
8005
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
8006
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
8007
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
|
8008
|
-
);
|
8009
|
-
cb(Vcur, "Vcur", il);
|
8010
|
-
|
8011
|
-
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8012
|
-
model.layers[il].wo, model.layers[il].bo,
|
8013
|
-
Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8014
|
-
}
|
8015
|
-
|
8016
|
-
if (il == n_layer - 1) {
|
8017
|
-
// skip computing output for unused tokens
|
8018
|
-
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8019
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8020
|
-
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
8021
|
-
}
|
8022
|
-
|
8023
|
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
8024
|
-
cb(ffn_inp, "ffn_inp", il);
|
8025
|
-
|
8026
|
-
// feed-forward network
|
8027
|
-
{
|
8028
|
-
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8029
|
-
model.layers[il].ffn_norm,
|
8030
|
-
model.layers[il].ffn_norm_b,
|
8031
|
-
LLM_NORM, cb, il);
|
8032
|
-
cb(cur, "ffn_norm", il);
|
8033
|
-
|
8034
|
-
cur = llm_build_ffn(ctx0, cur,
|
8035
|
-
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
8036
|
-
NULL, NULL,
|
8037
|
-
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8038
|
-
NULL,
|
8039
|
-
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
|
8040
|
-
cb(cur, "ffn_out", il);
|
8041
|
-
}
|
8042
|
-
|
8043
|
-
cur = ggml_add(ctx0, cur, ffn_inp);
|
8044
|
-
cb(cur, "l_out", il);
|
8045
|
-
|
8046
|
-
inpL = cur;
|
8047
|
-
}
|
8048
|
-
|
8049
|
-
cur = inpL;
|
8050
|
-
|
8051
|
-
cur = llm_build_norm(ctx0, cur, hparams,
|
8052
|
-
model.output_norm,
|
8053
|
-
model.output_norm_b,
|
8054
|
-
LLM_NORM, cb, -1);
|
8055
|
-
cb(cur, "result_norm", -1);
|
8056
|
-
|
8057
|
-
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8058
|
-
cb(cur, "result_output", -1);
|
8059
|
-
|
8060
|
-
ggml_build_forward_expand(gf, cur);
|
8061
|
-
|
8062
|
-
return gf;
|
8063
|
-
}
|
8064
|
-
|
8065
|
-
struct ggml_cgraph * build_refact() {
|
8066
|
-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8067
|
-
|
8068
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
8069
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
8070
|
-
|
8071
|
-
struct ggml_tensor * cur;
|
8072
|
-
struct ggml_tensor * inpL;
|
8073
|
-
|
8074
|
-
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
8075
|
-
|
8076
|
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8077
|
-
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8078
|
-
|
8079
|
-
// positions of the tokens in the KV cache
|
8080
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
8081
|
-
|
8082
|
-
for (int il = 0; il < n_layer; ++il) {
|
8083
|
-
struct ggml_tensor * inpSA = inpL;
|
8084
|
-
|
8085
|
-
cur = llm_build_norm(ctx0, inpL, hparams,
|
8086
|
-
model.layers[il].attn_norm, NULL,
|
8087
|
-
LLM_NORM_RMS, cb, il);
|
8088
|
-
cb(cur, "attn_norm", il);
|
8089
|
-
|
8090
|
-
// self-attention
|
8091
|
-
{
|
8092
|
-
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
8093
|
-
cb(Qcur, "Qcur", il);
|
8094
|
-
|
8095
|
-
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
8096
|
-
cb(Kcur, "Kcur", il);
|
8097
|
-
|
8098
|
-
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
8099
|
-
cb(Vcur, "Vcur", il);
|
8100
|
-
|
8101
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8102
|
-
cb(Kcur, "Kcur", il);
|
8103
|
-
|
8104
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8105
|
-
cb(Qcur, "Qcur", il);
|
8106
|
-
|
8107
|
-
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8108
|
-
model.layers[il].wo, NULL,
|
8109
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8110
|
-
}
|
8111
|
-
|
8112
|
-
if (il == n_layer - 1) {
|
8113
|
-
// skip computing output for unused tokens
|
8114
|
-
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8115
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8116
|
-
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8117
|
-
}
|
8129
|
+
if (il == n_layer - 1) {
|
8130
|
+
// skip computing output for unused tokens
|
8131
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8132
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8133
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8134
|
+
}
|
8118
8135
|
|
8119
8136
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
8120
8137
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -8168,8 +8185,11 @@ struct llm_build_context {
|
|
8168
8185
|
|
8169
8186
|
struct ggml_tensor * cur;
|
8170
8187
|
struct ggml_tensor * inpL;
|
8188
|
+
struct ggml_tensor * inp_pos = nullptr;
|
8171
8189
|
|
8172
|
-
|
8190
|
+
if (model.arch != LLM_ARCH_JINA_BERT_V2) {
|
8191
|
+
inp_pos = build_inp_pos();
|
8192
|
+
}
|
8173
8193
|
struct ggml_tensor * inp_mean = build_inp_mean();
|
8174
8194
|
struct ggml_tensor * inp_cls = build_inp_cls();
|
8175
8195
|
|
@@ -8200,13 +8220,26 @@ struct llm_build_context {
|
|
8200
8220
|
struct ggml_tensor * Vcur;
|
8201
8221
|
|
8202
8222
|
// self-attention
|
8203
|
-
if (model.arch == LLM_ARCH_BERT) {
|
8223
|
+
if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
|
8204
8224
|
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
8205
8225
|
cb(Qcur, "Qcur", il);
|
8206
8226
|
|
8227
|
+
if (model.layers[il].attn_q_norm) {
|
8228
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
8229
|
+
model.layers[il].attn_q_norm,
|
8230
|
+
model.layers[il].attn_q_norm_b,
|
8231
|
+
LLM_NORM, cb, il);
|
8232
|
+
}
|
8233
|
+
|
8207
8234
|
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
8208
8235
|
cb(Kcur, "Kcur", il);
|
8209
8236
|
|
8237
|
+
if (model.layers[il].attn_k_norm) {
|
8238
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
8239
|
+
model.layers[il].attn_k_norm,
|
8240
|
+
model.layers[il].attn_k_norm_b,
|
8241
|
+
LLM_NORM, cb, il);
|
8242
|
+
}
|
8210
8243
|
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
8211
8244
|
cb(Vcur, "Vcur", il);
|
8212
8245
|
|
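The blocks added above run an extra LayerNorm over Qcur/Kcur only when the optional attn_q_norm / attn_k_norm tensors were actually loaded (they were created with TENSOR_NOT_REQUIRED in the loader). Here is a compact sketch of such a guarded per-vector LayerNorm in plain C++; the shapes, epsilon and the empty-vector guard are simplifications of what llm_build_norm does on ggml tensors.

// Guarded LayerNorm sketch: normalise q only if norm weights exist, mirroring
// `if (model.layers[il].attn_q_norm) { ... }` in the builder.
#include <cmath>
#include <cstdio>
#include <vector>

void layer_norm(std::vector<float> & x, const std::vector<float> & w,
                const std::vector<float> & b, float eps = 1e-5f) {
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();
    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    const float inv = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < x.size(); ++i) {
        x[i] = (x[i] - mean) * inv * w[i] + b[i];
    }
}

int main() {
    std::vector<float> q = {0.5f, -1.0f, 2.0f, 0.0f};

    // optional weights: empty vectors stand in for tensors that were not loaded
    std::vector<float> q_norm_w = {1.0f, 1.0f, 1.0f, 1.0f};
    std::vector<float> q_norm_b = {0.0f, 0.0f, 0.0f, 0.0f};

    if (!q_norm_w.empty()) {           // the guard from the diff
        layer_norm(q, q_norm_w, q_norm_b);
    }
    std::printf("q[0] after optional norm: %f\n", q[0]);
    return 0;
}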
@@ -8225,15 +8258,15 @@ struct llm_build_context {
|
|
8225
8258
|
cb(Kcur, "Kcur", il);
|
8226
8259
|
cb(Vcur, "Vcur", il);
|
8227
8260
|
|
8228
|
-
Qcur =
|
8229
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8261
|
+
Qcur = ggml_rope_ext(
|
8262
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8230
8263
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8231
8264
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8232
8265
|
);
|
8233
8266
|
cb(Qcur, "Qcur", il);
|
8234
8267
|
|
8235
|
-
Kcur =
|
8236
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8268
|
+
Kcur = ggml_rope_ext(
|
8269
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8237
8270
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8238
8271
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8239
8272
|
);
|
@@ -8246,7 +8279,7 @@ struct llm_build_context {
|
|
8246
8279
|
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
8247
8280
|
cb(kq, "kq", il);
|
8248
8281
|
|
8249
|
-
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask,
|
8282
|
+
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
8250
8283
|
cb(kq, "kq_soft_max_ext", il);
|
8251
8284
|
|
8252
8285
|
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
@@ -8297,6 +8330,13 @@ struct llm_build_context {
|
|
8297
8330
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8298
8331
|
NULL,
|
8299
8332
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
8333
|
+
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
8334
|
+
cur = llm_build_ffn(ctx0, cur,
|
8335
|
+
model.layers[il].ffn_up, NULL,
|
8336
|
+
model.layers[il].ffn_gate, NULL,
|
8337
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8338
|
+
NULL,
|
8339
|
+
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
8300
8340
|
} else {
|
8301
8341
|
cur = llm_build_ffn(ctx0, cur,
|
8302
8342
|
model.layers[il].ffn_up, NULL,
|
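For LLM_ARCH_JINA_BERT_V2 the FFN above is built with LLM_FFN_GELU and LLM_FFN_PAR, i.e. a parallel gated block where, roughly, gelu(W_gate x) is multiplied element-wise with W_up x before the down projection, instead of the sequential up -> gelu -> down path used for plain BERT. A small reference sketch of that GEGLU-style computation follows; dimensions, weights and the tanh GELU approximation are toy choices, not values from the model.

// Reference sketch of a parallel gated GELU FFN:
//   y = W_down * ( gelu(W_gate x) * (W_up x) )   (element-wise product)
#include <cmath>
#include <cstdio>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;

static Vec matvec(const Mat & W, const Vec & x) {
    Vec y(W.size(), 0.0f);
    for (size_t r = 0; r < W.size(); ++r)
        for (size_t c = 0; c < x.size(); ++c)
            y[r] += W[r][c] * x[c];
    return y;
}

static float gelu(float v) {
    return 0.5f * v * (1.0f + std::tanh(0.7978845608f * (v + 0.044715f * v * v * v)));
}

int main() {
    Vec x = {1.0f, -0.5f};
    Mat up   = {{0.3f, 0.1f}, {0.2f, 0.4f}, {-0.1f, 0.5f}};  // n_ff x n_embd
    Mat gate = {{0.6f, -0.2f}, {0.1f, 0.1f}, {0.4f, 0.3f}};
    Mat down = {{0.5f, 0.5f, 0.5f}, {0.25f, 0.25f, 0.25f}};  // n_embd x n_ff

    Vec u = matvec(up, x);
    Vec g = matvec(gate, x);
    for (size_t i = 0; i < u.size(); ++i) u[i] *= gelu(g[i]); // parallel gating
    Vec y = matvec(down, u);

    std::printf("y = [%f, %f]\n", y[0], y[1]);
    return 0;
}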
@@ -8363,9 +8403,6 @@ struct llm_build_context {
|
|
8363
8403
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8364
8404
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8365
8405
|
|
8366
|
-
// positions of the tokens in the KV cache
|
8367
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
8368
|
-
|
8369
8406
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
8370
8407
|
model.tok_norm,
|
8371
8408
|
model.tok_norm_b,
|
@@ -8399,7 +8436,7 @@ struct llm_build_context {
|
|
8399
8436
|
|
8400
8437
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8401
8438
|
model.layers[il].wo, model.layers[il].bo,
|
8402
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8439
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8403
8440
|
}
|
8404
8441
|
|
8405
8442
|
if (il == n_layer - 1) {
|
@@ -8464,9 +8501,6 @@ struct llm_build_context {
|
|
8464
8501
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8465
8502
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8466
8503
|
|
8467
|
-
// positions of the tokens in the KV cache
|
8468
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
8469
|
-
|
8470
8504
|
if (model.pos_embd) {
|
8471
8505
|
// inp_pos - contains the positions
|
8472
8506
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
@@ -8530,13 +8564,13 @@ struct llm_build_context {
|
|
8530
8564
|
|
8531
8565
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8532
8566
|
model.layers[il].wo, model.layers[il].bo,
|
8533
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8567
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8534
8568
|
} else {
|
8535
8569
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8536
8570
|
|
8537
8571
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8538
8572
|
model.layers[il].wo, model.layers[il].bo,
|
8539
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8573
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8540
8574
|
}
|
8541
8575
|
}
|
8542
8576
|
|
@@ -8664,15 +8698,15 @@ struct llm_build_context {
|
|
8664
8698
|
}
|
8665
8699
|
|
8666
8700
|
|
8667
|
-
Qcur =
|
8668
|
-
ctx0, Qcur, inp_pos,
|
8701
|
+
Qcur = ggml_rope_ext(
|
8702
|
+
ctx0, Qcur, inp_pos, nullptr,
|
8669
8703
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8670
8704
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8671
8705
|
);
|
8672
8706
|
cb(Qcur, "Qcur", il);
|
8673
8707
|
|
8674
|
-
Kcur =
|
8675
|
-
ctx0, Kcur, inp_pos,
|
8708
|
+
Kcur = ggml_rope_ext(
|
8709
|
+
ctx0, Kcur, inp_pos, nullptr,
|
8676
8710
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8677
8711
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8678
8712
|
);
|
@@ -8680,7 +8714,7 @@ struct llm_build_context {
|
|
8680
8714
|
|
8681
8715
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8682
8716
|
model.layers[il].wo, NULL,
|
8683
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8717
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8684
8718
|
}
|
8685
8719
|
|
8686
8720
|
if (il == n_layer - 1) {
|
@@ -8784,21 +8818,21 @@ struct llm_build_context {
|
|
8784
8818
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8785
8819
|
|
8786
8820
|
// using mode = 2 for neox mode
|
8787
|
-
Qcur =
|
8788
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8821
|
+
Qcur = ggml_rope_ext(
|
8822
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
8789
8823
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8790
8824
|
);
|
8791
8825
|
cb(Qcur, "Qcur", il);
|
8792
8826
|
|
8793
|
-
Kcur =
|
8794
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8827
|
+
Kcur = ggml_rope_ext(
|
8828
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
8795
8829
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8796
8830
|
);
|
8797
8831
|
cb(Kcur, "Kcur", il);
|
8798
8832
|
|
8799
8833
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8800
8834
|
model.layers[il].wo, NULL,
|
8801
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8835
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8802
8836
|
}
|
8803
8837
|
|
8804
8838
|
if (il == n_layer - 1) {
|
@@ -8895,15 +8929,15 @@ struct llm_build_context {
|
|
8895
8929
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8896
8930
|
cb(Vcur, "Vcur", il);
|
8897
8931
|
|
8898
|
-
Qcur =
|
8899
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8932
|
+
Qcur = ggml_rope_ext(
|
8933
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8900
8934
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8901
8935
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8902
8936
|
);
|
8903
8937
|
cb(Qcur, "Qcur", il);
|
8904
8938
|
|
8905
|
-
Kcur =
|
8906
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8939
|
+
Kcur = ggml_rope_ext(
|
8940
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8907
8941
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8908
8942
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8909
8943
|
);
|
@@ -8911,7 +8945,7 @@ struct llm_build_context {
|
|
8911
8945
|
|
8912
8946
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8913
8947
|
model.layers[il].wo, model.layers[il].bo,
|
8914
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8948
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8915
8949
|
}
|
8916
8950
|
|
8917
8951
|
if (il == n_layer - 1) {
|
@@ -9009,15 +9043,15 @@ struct llm_build_context {
|
|
9009
9043
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
9010
9044
|
cb(Vcur, "Vcur", il);
|
9011
9045
|
|
9012
|
-
Qcur =
|
9013
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9046
|
+
Qcur = ggml_rope_ext(
|
9047
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9014
9048
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9015
9049
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9016
9050
|
);
|
9017
9051
|
cb(Qcur, "Qcur", il);
|
9018
9052
|
|
9019
|
-
Kcur =
|
9020
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9053
|
+
Kcur = ggml_rope_ext(
|
9054
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9021
9055
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9022
9056
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9023
9057
|
);
|
@@ -9025,7 +9059,7 @@ struct llm_build_context {
|
|
9025
9059
|
|
9026
9060
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9027
9061
|
model.layers[il].wo, model.layers[il].bo,
|
9028
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9062
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9029
9063
|
}
|
9030
9064
|
|
9031
9065
|
if (il == n_layer - 1) {
|
@@ -9161,8 +9195,8 @@ struct llm_build_context {
|
|
9161
9195
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9162
9196
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9163
9197
|
|
9164
|
-
Qcur =
|
9165
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9198
|
+
Qcur = ggml_rope_ext(
|
9199
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9166
9200
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9167
9201
|
);
|
9168
9202
|
cb(Qcur, "Qcur", il);
|
@@ -9172,15 +9206,15 @@ struct llm_build_context {
|
|
9172
9206
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
9173
9207
|
cb(Qcur, "Qcur", il);
|
9174
9208
|
|
9175
|
-
Kcur =
|
9176
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9209
|
+
Kcur = ggml_rope_ext(
|
9210
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9177
9211
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9178
9212
|
);
|
9179
9213
|
cb(Kcur, "Kcur", il);
|
9180
9214
|
|
9181
9215
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9182
9216
|
model.layers[il].wo, model.layers[il].bo,
|
9183
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9217
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9184
9218
|
}
|
9185
9219
|
|
9186
9220
|
if (il == n_layer - 1) {
|
@@ -9249,6 +9283,9 @@ struct llm_build_context {
|
|
9249
9283
|
|
9250
9284
|
// self-attention
|
9251
9285
|
{
|
9286
|
+
// rope freq factors for 128k context
|
9287
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
9288
|
+
|
9252
9289
|
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
9253
9290
|
model.layers[il].attn_norm,
|
9254
9291
|
NULL,
|
@@ -9280,8 +9317,8 @@ struct llm_build_context {
|
|
9280
9317
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9281
9318
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9282
9319
|
|
9283
|
-
Qcur =
|
9284
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9320
|
+
Qcur = ggml_rope_ext(
|
9321
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
9285
9322
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9286
9323
|
);
|
9287
9324
|
cb(Qcur, "Qcur", il);
|
@@ -9289,15 +9326,15 @@ struct llm_build_context {
|
|
9289
9326
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
9290
9327
|
cb(Qcur, "Qcur", il);
|
9291
9328
|
|
9292
|
-
Kcur =
|
9293
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9329
|
+
Kcur = ggml_rope_ext(
|
9330
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
9294
9331
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9295
9332
|
);
|
9296
9333
|
cb(Kcur, "Kcur", il);
|
9297
9334
|
|
9298
9335
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9299
9336
|
model.layers[il].wo, model.layers[il].bo,
|
9300
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9337
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9301
9338
|
}
|
9302
9339
|
|
9303
9340
|
if (il == n_layer - 1) {
|
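As an aside, every RoPE call in these hunks moves to ggml_rope_ext, which takes one extra tensor after inp_pos: per-dimension frequency factors (nullptr when unused, the rope_factors tensor for the long-context Phi-3 path above). The sketch below shows the scaling as I understand it, under the assumption that each rotary frequency is divided by its factor; the dimensions and factor values are made up.

#include <cmath>
#include <cstdio>
#include <vector>

// Rotation angle for dimension pair i at position p.
// Without factors: theta_i = p * base^(-2i/d).
// With freq factors (assumption based on this diff): theta_i is divided by factor[i].
int main() {
    const float base = 10000.0f;
    const int   d    = 8;   // tiny head dimension for illustration
    const int   pos  = 42;
    std::vector<float> factor = {1.0f, 1.0f, 4.0f, 8.0f}; // illustrative long-context factors

    for (int i = 0; i < d / 2; ++i) {
        const float freq  = std::pow(base, -2.0f * i / d);
        const float theta = pos * freq / factor[i];
        std::printf("dim pair %d: theta = %f\n", i, theta);
    }
    return 0;
}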
@@ -9396,21 +9433,21 @@ struct llm_build_context {
|
|
9396
9433
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
9397
9434
|
cb(Vcur, "Vcur", il);
|
9398
9435
|
|
9399
|
-
Qcur =
|
9400
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
|
9436
|
+
Qcur = ggml_rope_ext(
|
9437
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
9401
9438
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9402
9439
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9403
9440
|
cb(Qcur, "Qcur", il);
|
9404
9441
|
|
9405
|
-
Kcur =
|
9406
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
|
9442
|
+
Kcur = ggml_rope_ext(
|
9443
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
9407
9444
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9408
9445
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9409
9446
|
cb(Kcur, "Kcur", il);
|
9410
9447
|
|
9411
9448
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9412
9449
|
model.layers[il].wo, NULL,
|
9413
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9450
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9414
9451
|
}
|
9415
9452
|
struct ggml_tensor * sa_out = cur;
|
9416
9453
|
|
@@ -9513,7 +9550,7 @@ struct llm_build_context {
|
|
9513
9550
|
|
9514
9551
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9515
9552
|
model.layers[il].wo, model.layers[il].bo,
|
9516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9553
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9517
9554
|
}
|
9518
9555
|
|
9519
9556
|
if (il == n_layer - 1) {
|
@@ -9604,15 +9641,15 @@ struct llm_build_context {
|
|
9604
9641
|
cb(tmpk, "tmpk", il);
|
9605
9642
|
cb(Vcur, "Vcur", il);
|
9606
9643
|
|
9607
|
-
struct ggml_tensor * Qcur =
|
9608
|
-
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
9644
|
+
struct ggml_tensor * Qcur = ggml_rope_ext(
|
9645
|
+
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9609
9646
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9610
9647
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9611
9648
|
);
|
9612
9649
|
cb(Qcur, "Qcur", il);
|
9613
9650
|
|
9614
|
-
struct ggml_tensor * Kcur =
|
9615
|
-
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9651
|
+
struct ggml_tensor * Kcur = ggml_rope_ext(
|
9652
|
+
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9616
9653
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9617
9654
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9618
9655
|
);
|
@@ -9620,7 +9657,7 @@ struct llm_build_context {
|
|
9620
9657
|
|
9621
9658
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9622
9659
|
model.layers[il].wo, model.layers[il].bo,
|
9623
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9660
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9624
9661
|
}
|
9625
9662
|
|
9626
9663
|
if (il == n_layer - 1) {
|
@@ -9720,15 +9757,15 @@ struct llm_build_context {
|
|
9720
9757
|
// cb(Vcur, "Vcur", il);
|
9721
9758
|
// }
|
9722
9759
|
|
9723
|
-
Qcur =
|
9724
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9760
|
+
Qcur = ggml_rope_ext(
|
9761
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9725
9762
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9726
9763
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9727
9764
|
);
|
9728
9765
|
cb(Qcur, "Qcur", il);
|
9729
9766
|
|
9730
|
-
Kcur =
|
9731
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9767
|
+
Kcur = ggml_rope_ext(
|
9768
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9732
9769
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9733
9770
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9734
9771
|
);
|
@@ -9736,7 +9773,7 @@ struct llm_build_context {
|
|
9736
9773
|
|
9737
9774
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9738
9775
|
model.layers[il].wo, NULL,
|
9739
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9776
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9740
9777
|
}
|
9741
9778
|
|
9742
9779
|
if (il == n_layer - 1) {
|
@@ -9837,15 +9874,15 @@ struct llm_build_context {
|
|
9837
9874
|
cb(Vcur, "Vcur", il);
|
9838
9875
|
}
|
9839
9876
|
|
9840
|
-
Qcur =
|
9841
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9877
|
+
Qcur = ggml_rope_ext(
|
9878
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9842
9879
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9843
9880
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9844
9881
|
);
|
9845
9882
|
cb(Qcur, "Qcur", il);
|
9846
9883
|
|
9847
|
-
Kcur =
|
9848
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9884
|
+
Kcur = ggml_rope_ext(
|
9885
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9849
9886
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9850
9887
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9851
9888
|
);
|
@@ -9853,7 +9890,7 @@ struct llm_build_context {
|
|
9853
9890
|
|
9854
9891
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9855
9892
|
model.layers[il].wo, model.layers[il].bo,
|
9856
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9893
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9857
9894
|
}
|
9858
9895
|
|
9859
9896
|
if (il == n_layer - 1) {
|
@@ -9967,15 +10004,15 @@ struct llm_build_context {
|
|
9967
10004
|
cb(Vcur, "Vcur", il);
|
9968
10005
|
}
|
9969
10006
|
|
9970
|
-
Qcur =
|
9971
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10007
|
+
Qcur = ggml_rope_ext(
|
10008
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9972
10009
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9973
10010
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9974
10011
|
);
|
9975
10012
|
cb(Qcur, "Qcur", il);
|
9976
10013
|
|
9977
|
-
Kcur =
|
9978
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10014
|
+
Kcur = ggml_rope_ext(
|
10015
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9979
10016
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9980
10017
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9981
10018
|
);
|
@@ -9983,7 +10020,7 @@ struct llm_build_context {
|
|
9983
10020
|
|
9984
10021
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9985
10022
|
model.layers[il].wo, model.layers[il].bo,
|
9986
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10023
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9987
10024
|
}
|
9988
10025
|
|
9989
10026
|
if (il == n_layer - 1) {
|
@@ -10087,8 +10124,8 @@ struct llm_build_context {
|
|
10087
10124
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10088
10125
|
cb(Vcur, "Vcur", il);
|
10089
10126
|
|
10090
|
-
Qcur =
|
10091
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
10127
|
+
Qcur = ggml_rope_ext(
|
10128
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
10092
10129
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10093
10130
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10094
10131
|
cb(Qcur, "Qcur", il);
|
@@ -10096,15 +10133,15 @@ struct llm_build_context {
|
|
10096
10133
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
10097
10134
|
cb(Qcur, "Qcur_scaled", il);
|
10098
10135
|
|
10099
|
-
Kcur =
|
10100
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
10136
|
+
Kcur = ggml_rope_ext(
|
10137
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
10101
10138
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10102
10139
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10103
10140
|
cb(Kcur, "Kcur", il);
|
10104
10141
|
|
10105
10142
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10106
10143
|
model.layers[il].wo, NULL,
|
10107
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10144
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
10108
10145
|
}
|
10109
10146
|
|
10110
10147
|
if (il == n_layer - 1) {
|
@@ -10207,15 +10244,15 @@ struct llm_build_context {
|
|
10207
10244
|
cb(Vcur, "Vcur", il);
|
10208
10245
|
}
|
10209
10246
|
|
10210
|
-
Qcur =
|
10211
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10247
|
+
Qcur = ggml_rope_ext(
|
10248
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10212
10249
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10213
10250
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10214
10251
|
);
|
10215
10252
|
cb(Qcur, "Qcur", il);
|
10216
10253
|
|
10217
|
-
Kcur =
|
10218
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10254
|
+
Kcur = ggml_rope_ext(
|
10255
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10219
10256
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10220
10257
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10221
10258
|
);
|
@@ -10223,7 +10260,7 @@ struct llm_build_context {
|
|
10223
10260
|
|
10224
10261
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10225
10262
|
model.layers[il].wo, model.layers[il].bo,
|
10226
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10263
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10227
10264
|
}
|
10228
10265
|
|
10229
10266
|
if (il == n_layer - 1) {
|
@@ -10490,22 +10527,267 @@ struct llm_build_context {
|
|
10490
10527
|
LLM_NORM, cb, il);
|
10491
10528
|
cb(Qcur, "Qcur", il);
|
10492
10529
|
|
10493
|
-
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
10494
|
-
model.layers[il].attn_k_norm,
|
10495
|
-
NULL,
|
10496
|
-
LLM_NORM, cb, il);
|
10497
|
-
cb(Kcur, "Kcur", il);
|
10498
|
-
}
|
10530
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
10531
|
+
model.layers[il].attn_k_norm,
|
10532
|
+
NULL,
|
10533
|
+
LLM_NORM, cb, il);
|
10534
|
+
cb(Kcur, "Kcur", il);
|
10535
|
+
}
|
10536
|
+
|
10537
|
+
Qcur = ggml_rope_ext(
|
10538
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10539
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10540
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10541
|
+
);
|
10542
|
+
cb(Qcur, "Qcur", il);
|
10543
|
+
|
10544
|
+
Kcur = ggml_rope_ext(
|
10545
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10546
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10547
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10548
|
+
);
|
10549
|
+
cb(Kcur, "Kcur", il);
|
10550
|
+
|
10551
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10552
|
+
model.layers[il].wo, model.layers[il].bo,
|
10553
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10554
|
+
}
|
10555
|
+
|
10556
|
+
if (il == n_layer - 1) {
|
10557
|
+
// skip computing output for unused tokens
|
10558
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10559
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10560
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
10561
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
10562
|
+
}
|
10563
|
+
|
10564
|
+
struct ggml_tensor * attn_out = cur;
|
10565
|
+
|
10566
|
+
// feed-forward network
|
10567
|
+
{
|
10568
|
+
cur = llm_build_ffn(ctx0, ffn_inp,
|
10569
|
+
model.layers[il].ffn_up, NULL,
|
10570
|
+
model.layers[il].ffn_gate, NULL,
|
10571
|
+
model.layers[il].ffn_down, NULL,
|
10572
|
+
NULL,
|
10573
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10574
|
+
cb(cur, "ffn_out", il);
|
10575
|
+
}
|
10576
|
+
|
10577
|
+
// add together residual + FFN + self-attention
|
10578
|
+
cur = ggml_add(ctx0, cur, inpL);
|
10579
|
+
cur = ggml_add(ctx0, cur, attn_out);
|
10580
|
+
cb(cur, "l_out", il);
|
10581
|
+
|
10582
|
+
// input for next layer
|
10583
|
+
inpL = cur;
|
10584
|
+
}
|
10585
|
+
|
10586
|
+
cur = inpL;
|
10587
|
+
|
10588
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10589
|
+
model.output_norm, NULL,
|
10590
|
+
LLM_NORM, cb, -1);
|
10591
|
+
cb(cur, "result_norm", -1);
|
10592
|
+
|
10593
|
+
// lm_head
|
10594
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10595
|
+
|
10596
|
+
if (f_logit_scale) {
|
10597
|
+
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
10598
|
+
}
|
10599
|
+
|
10600
|
+
cb(cur, "result_output", -1);
|
10601
|
+
|
10602
|
+
ggml_build_forward_expand(gf, cur);
|
10603
|
+
|
10604
|
+
return gf;
|
10605
|
+
|
10606
|
+
}
|
10607
|
+
|
10608
|
+
// ref: https://allenai.org/olmo
|
10609
|
+
// based on the original build_llama() function, changes:
|
10610
|
+
// * non-parametric layer norm
|
10611
|
+
// * clamp qkv
|
10612
|
+
// * removed bias
|
10613
|
+
// * removed MoE
|
10614
|
+
struct ggml_cgraph * build_olmo() {
|
10615
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10616
|
+
|
10617
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
10618
|
+
int32_t n_tokens = this->n_tokens;
|
10619
|
+
|
10620
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10621
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10622
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
10623
|
+
|
10624
|
+
struct ggml_tensor * cur;
|
10625
|
+
struct ggml_tensor * inpL;
|
10626
|
+
|
10627
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10628
|
+
|
10629
|
+
// inp_pos - contains the positions
|
10630
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10631
|
+
|
10632
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10633
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10634
|
+
|
10635
|
+
for (int il = 0; il < n_layer; ++il) {
|
10636
|
+
struct ggml_tensor * inpSA = inpL;
|
10637
|
+
|
10638
|
+
// norm
|
10639
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10640
|
+
NULL, NULL,
|
10641
|
+
LLM_NORM, cb, il);
|
10642
|
+
cb(cur, "attn_norm", il);
|
10643
|
+
|
10644
|
+
// self-attention
|
10645
|
+
{
|
10646
|
+
// compute Q and K and RoPE them
|
10647
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10648
|
+
cb(Qcur, "Qcur", il);
|
10649
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10650
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10651
|
+
cb(Qcur, "Qcur", il);
|
10652
|
+
}
|
10653
|
+
|
10654
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10655
|
+
cb(Kcur, "Kcur", il);
|
10656
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10657
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10658
|
+
cb(Kcur, "Kcur", il);
|
10659
|
+
}
|
10660
|
+
|
10661
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10662
|
+
cb(Vcur, "Vcur", il);
|
10663
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10664
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10665
|
+
cb(Vcur, "Vcur", il);
|
10666
|
+
}
|
10667
|
+
|
10668
|
+
Qcur = ggml_rope_ext(
|
10669
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10670
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10671
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10672
|
+
);
|
10673
|
+
cb(Qcur, "Qcur", il);
|
10674
|
+
|
10675
|
+
Kcur = ggml_rope_ext(
|
10676
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10677
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10678
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10679
|
+
);
|
10680
|
+
cb(Kcur, "Kcur", il);
|
10681
|
+
|
10682
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10683
|
+
model.layers[il].wo, nullptr,
|
10684
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10685
|
+
}
|
10686
|
+
|
10687
|
+
if (il == n_layer - 1) {
|
10688
|
+
// skip computing output for unused tokens
|
10689
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10690
|
+
n_tokens = n_outputs;
|
10691
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10692
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
10693
|
+
}
|
10694
|
+
|
10695
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
10696
|
+
cb(ffn_inp, "ffn_inp", il);
|
10697
|
+
|
10698
|
+
// feed-forward network
|
10699
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10700
|
+
NULL, NULL,
|
10701
|
+
LLM_NORM, cb, il);
|
10702
|
+
cb(cur, "ffn_norm", il);
|
10703
|
+
|
10704
|
+
cur = llm_build_ffn(ctx0, cur,
|
10705
|
+
model.layers[il].ffn_up, NULL,
|
10706
|
+
model.layers[il].ffn_gate, NULL,
|
10707
|
+
model.layers[il].ffn_down, NULL,
|
10708
|
+
NULL,
|
10709
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10710
|
+
cb(cur, "ffn_out", il);
|
10711
|
+
|
10712
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
10713
|
+
cb(cur, "ffn_out", il);
|
10714
|
+
|
10715
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
10716
|
+
if (layer_dir != nullptr) {
|
10717
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
10718
|
+
}
|
10719
|
+
cb(cur, "l_out", il);
|
10720
|
+
|
10721
|
+
// input for next layer
|
10722
|
+
inpL = cur;
|
10723
|
+
}
|
10724
|
+
|
10725
|
+
cur = inpL;
|
10726
|
+
|
10727
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10728
|
+
NULL, NULL,
|
10729
|
+
LLM_NORM, cb, -1);
|
10730
|
+
cb(cur, "result_norm", -1);
|
10731
|
+
|
10732
|
+
// lm_head
|
10733
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10734
|
+
cb(cur, "result_output", -1);
|
10735
|
+
|
10736
|
+
ggml_build_forward_expand(gf, cur);
|
10737
|
+
|
10738
|
+
return gf;
|
10739
|
+
}
|
10740
|
+
|
10741
|
+
struct ggml_cgraph * build_gptneox() {
|
10742
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10743
|
+
|
10744
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10745
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
10746
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10747
|
+
|
10748
|
+
struct ggml_tensor * cur;
|
10749
|
+
struct ggml_tensor * inpL;
|
10750
|
+
|
10751
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10752
|
+
|
10753
|
+
// inp_pos - contains the positions
|
10754
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10755
|
+
|
10756
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10757
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10758
|
+
|
10759
|
+
for (int il = 0; il < n_layer; ++il) {
|
10760
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10761
|
+
model.layers[il].attn_norm,
|
10762
|
+
model.layers[il].attn_norm_b,
|
10763
|
+
LLM_NORM, cb, il);
|
10764
|
+
cb(cur, "attn_norm", il);
|
10765
|
+
|
10766
|
+
// self-attention
|
10767
|
+
{
|
10768
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
10769
|
+
cb(cur, "wqkv", il);
|
10770
|
+
|
10771
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
10772
|
+
cb(cur, "bqkv", il);
|
10773
|
+
|
10774
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
10775
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
10776
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
10777
|
+
|
10778
|
+
cb(Qcur, "Qcur", il);
|
10779
|
+
cb(Kcur, "Kcur", il);
|
10780
|
+
cb(Vcur, "Vcur", il);
|
10499
10781
|
|
10500
|
-
Qcur =
|
10501
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10782
|
+
Qcur = ggml_rope_ext(
|
10783
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10502
10784
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10503
10785
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10504
10786
|
);
|
10505
10787
|
cb(Qcur, "Qcur", il);
|
10506
10788
|
|
10507
|
-
Kcur =
|
10508
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10789
|
+
Kcur = ggml_rope_ext(
|
10790
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10509
10791
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10510
10792
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10511
10793
|
);
|
@@ -10513,68 +10795,84 @@ struct llm_build_context {
|
|
10513
10795
|
|
10514
10796
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10515
10797
|
model.layers[il].wo, model.layers[il].bo,
|
10516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10798
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10517
10799
|
}
|
10518
10800
|
|
10519
10801
|
if (il == n_layer - 1) {
|
10520
10802
|
// skip computing output for unused tokens
|
10521
10803
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10522
|
-
cur
|
10523
|
-
inpL
|
10524
|
-
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
10804
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10805
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
10525
10806
|
}
|
10526
10807
|
|
10527
|
-
|
10808
|
+
// ffn
|
10809
|
+
if (hparams.use_par_res) {
|
10810
|
+
// attention and ffn are computed in parallel
|
10811
|
+
// x = x + attn(ln1(x)) + ffn(ln2(x))
|
10528
10812
|
|
10529
|
-
|
10530
|
-
|
10531
|
-
cur =
|
10532
|
-
model.layers[il].
|
10533
|
-
model.layers[il].
|
10534
|
-
|
10813
|
+
struct ggml_tensor * attn_out = cur;
|
10814
|
+
|
10815
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10816
|
+
model.layers[il].ffn_norm,
|
10817
|
+
model.layers[il].ffn_norm_b,
|
10818
|
+
LLM_NORM, cb, il);
|
10819
|
+
cb(cur, "ffn_norm", il);
|
10820
|
+
|
10821
|
+
cur = llm_build_ffn(ctx0, cur,
|
10822
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
10823
|
+
NULL, NULL,
|
10824
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
10535
10825
|
NULL,
|
10536
|
-
|
10826
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
10537
10827
|
cb(cur, "ffn_out", il);
|
10538
|
-
}
|
10539
10828
|
|
10540
|
-
|
10541
|
-
|
10542
|
-
cur = ggml_add(ctx0, cur, attn_out);
|
10543
|
-
cb(cur, "l_out", il);
|
10829
|
+
cur = ggml_add(ctx0, cur, inpL);
|
10830
|
+
cb(cur, "ffn_out", il);
|
10544
10831
|
|
10545
|
-
|
10546
|
-
|
10547
|
-
|
10832
|
+
inpL = ggml_add(ctx0, cur, attn_out);
|
10833
|
+
cb(inpL, "l_out", il);
|
10834
|
+
} else {
|
10835
|
+
// attention and ffn are computed sequentially
|
10836
|
+
// x = x + attn(ln1(x))
|
10837
|
+
// x = x + ffn(ln2(x))
|
10548
10838
|
|
10549
|
-
|
10839
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
10840
|
+
cb(ffn_inp, "ffn_inp", il);
|
10550
10841
|
|
10551
|
-
|
10552
|
-
|
10553
|
-
|
10554
|
-
|
10842
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10843
|
+
model.layers[il].ffn_norm,
|
10844
|
+
model.layers[il].ffn_norm_b,
|
10845
|
+
LLM_NORM, cb, il);
|
10846
|
+
cb(cur, "ffn_norm", il);
|
10555
10847
|
|
10556
|
-
|
10557
|
-
|
10848
|
+
cur = llm_build_ffn(ctx0, cur,
|
10849
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
10850
|
+
NULL, NULL,
|
10851
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
10852
|
+
NULL,
|
10853
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
10854
|
+
cb(cur, "ffn_out", il);
|
10558
10855
|
|
10559
|
-
|
10560
|
-
|
10856
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
10857
|
+
cb(inpL, "l_out", il);
|
10858
|
+
}
|
10561
10859
|
}
|
10562
10860
|
|
10861
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10862
|
+
model.output_norm,
|
10863
|
+
model.output_norm_b,
|
10864
|
+
LLM_NORM, cb, -1);
|
10865
|
+
cb(cur, "result_norm", -1);
|
10866
|
+
|
10867
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10563
10868
|
cb(cur, "result_output", -1);
|
10564
10869
|
|
10565
10870
|
ggml_build_forward_expand(gf, cur);
|
10566
10871
|
|
10567
10872
|
return gf;
|
10568
|
-
|
10569
10873
|
}
|
10570
10874
|
|
10571
|
-
|
10572
|
-
// based on the original build_llama() function, changes:
|
10573
|
-
// * non-parametric layer norm
|
10574
|
-
// * clamp qkv
|
10575
|
-
// * removed bias
|
10576
|
-
// * removed MoE
|
10577
|
-
struct ggml_cgraph * build_olmo() {
|
10875
|
+
struct ggml_cgraph * build_arctic() {
|
10578
10876
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10579
10877
|
|
10580
10878
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
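A short note on the restored build_gptneox graph above: it branches on hparams.use_par_res. With parallel residual, attention and FFN both read the layer input and are summed with it (x + attn(ln1(x)) + ffn(ln2(x))); without it, the attention output feeds the FFN branch. A small sketch of the two orderings with stand-in sub-blocks:

#include <cstdio>

// Stand-ins for the real sub-blocks; only the residual wiring matters here.
static float ln1(float x)  { return x * 0.9f; }
static float ln2(float x)  { return x * 1.1f; }
static float attn(float x) { return x + 0.5f; }
static float ffn(float x)  { return x * 2.0f; }

int main() {
    const float x = 1.0f;

    // use_par_res = true: attention and FFN are computed in parallel
    const float parallel = x + attn(ln1(x)) + ffn(ln2(x));

    // use_par_res = false: attention output feeds the FFN branch
    const float h          = x + attn(ln1(x));
    const float sequential = h + ffn(ln2(h));

    std::printf("parallel: %f, sequential: %f\n", parallel, sequential);
    return 0;
}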
@@ -10600,8 +10898,8 @@ struct llm_build_context {
|
|
10600
10898
|
|
10601
10899
|
// norm
|
10602
10900
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
10603
|
-
|
10604
|
-
|
10901
|
+
model.layers[il].attn_norm, NULL,
|
10902
|
+
LLM_NORM_RMS, cb, il);
|
10605
10903
|
cb(cur, "attn_norm", il);
|
10606
10904
|
|
10607
10905
|
// self-attention
|
@@ -10609,42 +10907,30 @@ struct llm_build_context {
|
|
10609
10907
|
// compute Q and K and RoPE them
|
10610
10908
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10611
10909
|
cb(Qcur, "Qcur", il);
|
10612
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10613
|
-
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10614
|
-
cb(Qcur, "Qcur", il);
|
10615
|
-
}
|
10616
10910
|
|
10617
10911
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10618
10912
|
cb(Kcur, "Kcur", il);
|
10619
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10620
|
-
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10621
|
-
cb(Kcur, "Kcur", il);
|
10622
|
-
}
|
10623
10913
|
|
10624
10914
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10625
10915
|
cb(Vcur, "Vcur", il);
|
10626
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10627
|
-
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10628
|
-
cb(Vcur, "Vcur", il);
|
10629
|
-
}
|
10630
10916
|
|
10631
|
-
Qcur =
|
10632
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10917
|
+
Qcur = ggml_rope_ext(
|
10918
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10633
10919
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10634
10920
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10635
10921
|
);
|
10636
10922
|
cb(Qcur, "Qcur", il);
|
10637
10923
|
|
10638
|
-
Kcur =
|
10639
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10924
|
+
Kcur = ggml_rope_ext(
|
10925
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10640
10926
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10641
10927
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10642
10928
|
);
|
10643
10929
|
cb(Kcur, "Kcur", il);
|
10644
10930
|
|
10645
10931
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10646
|
-
model.layers[il].wo,
|
10647
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
10932
|
+
model.layers[il].wo, NULL,
|
10933
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10648
10934
|
}
|
10649
10935
|
|
10650
10936
|
if (il == n_layer - 1) {
|
@@ -10660,8 +10946,8 @@ struct llm_build_context {
|
|
10660
10946
|
|
10661
10947
|
// feed-forward network
|
10662
10948
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10663
|
-
|
10664
|
-
|
10949
|
+
model.layers[il].ffn_norm, NULL,
|
10950
|
+
LLM_NORM_RMS, cb, il);
|
10665
10951
|
cb(cur, "ffn_norm", il);
|
10666
10952
|
|
10667
10953
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -10672,7 +10958,26 @@ struct llm_build_context {
|
|
10672
10958
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10673
10959
|
cb(cur, "ffn_out", il);
|
10674
10960
|
|
10675
|
-
|
10961
|
+
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
10962
|
+
cb(ffn_out, "ffn_out", il);
|
10963
|
+
|
10964
|
+
// MoE
|
10965
|
+
cur = llm_build_norm(ctx0, inpSA, hparams,
|
10966
|
+
model.layers[il].ffn_norm_exps, NULL,
|
10967
|
+
LLM_NORM_RMS, cb, il);
|
10968
|
+
cb(cur, "ffn_norm_exps", il);
|
10969
|
+
|
10970
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
10971
|
+
model.layers[il].ffn_gate_inp,
|
10972
|
+
model.layers[il].ffn_up_exps,
|
10973
|
+
model.layers[il].ffn_gate_exps,
|
10974
|
+
model.layers[il].ffn_down_exps,
|
10975
|
+
n_expert, n_expert_used,
|
10976
|
+
LLM_FFN_SILU, true,
|
10977
|
+
cb, il);
|
10978
|
+
cb(cur, "ffn_moe_out", il);
|
10979
|
+
|
10980
|
+
cur = ggml_add(ctx0, cur, ffn_out);
|
10676
10981
|
cb(cur, "ffn_out", il);
|
10677
10982
|
|
10678
10983
|
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
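In the new build_arctic graph above, each layer keeps a dense SwiGLU FFN on the post-attention residual and, in parallel, runs an MoE FFN over a separately normalized copy of the layer input (ffn_norm_exps), then adds the two results. A toy sketch of that wiring follows; the expert routing inside llm_build_moe_ffn is elided and the stand-in functions are invented.

#include <cstdio>

// Toy stand-ins; only the add-the-two-branches structure mirrors the hunk above.
static float rms_norm(float x)  { return x * 0.8f; }
static float dense_ffn(float x) { return x * 1.5f; }
static float moe_ffn(float x)   { return x * 0.3f; } // routing / top-k not shown

int main() {
    const float inpSA   = 1.0f;  // layer input used by the MoE branch
    const float ffn_inp = 1.2f;  // attention output + residual

    const float ffn_out = dense_ffn(rms_norm(ffn_inp)) + ffn_inp; // dense branch + residual
    const float moe_out = moe_ffn(rms_norm(inpSA));               // MoE branch on its own norm
    const float l_out   = moe_out + ffn_out;

    std::printf("layer output: %f\n", l_out);
    return 0;
}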
@@ -10688,8 +10993,8 @@ struct llm_build_context {
|
|
10688
10993
|
cur = inpL;
|
10689
10994
|
|
10690
10995
|
cur = llm_build_norm(ctx0, cur, hparams,
|
10691
|
-
|
10692
|
-
|
10996
|
+
model.output_norm, NULL,
|
10997
|
+
LLM_NORM_RMS, cb, -1);
|
10693
10998
|
cb(cur, "result_norm", -1);
|
10694
10999
|
|
10695
11000
|
// lm_head
|
@@ -10816,15 +11121,12 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10816
11121
|
{
|
10817
11122
|
result = llm.build_starcoder();
|
10818
11123
|
} break;
|
10819
|
-
case LLM_ARCH_PERSIMMON:
|
10820
|
-
{
|
10821
|
-
result = llm.build_persimmon();
|
10822
|
-
} break;
|
10823
11124
|
case LLM_ARCH_REFACT:
|
10824
11125
|
{
|
10825
11126
|
result = llm.build_refact();
|
10826
11127
|
} break;
|
10827
11128
|
case LLM_ARCH_BERT:
|
11129
|
+
case LLM_ARCH_JINA_BERT_V2:
|
10828
11130
|
case LLM_ARCH_NOMIC_BERT:
|
10829
11131
|
{
|
10830
11132
|
result = llm.build_bert();
|
@@ -10913,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10913
11215
|
{
|
10914
11216
|
result = llm.build_olmo();
|
10915
11217
|
} break;
|
11218
|
+
case LLM_ARCH_GPTNEOX:
|
11219
|
+
{
|
11220
|
+
result = llm.build_gptneox();
|
11221
|
+
} break;
|
11222
|
+
case LLM_ARCH_ARCTIC:
|
11223
|
+
{
|
11224
|
+
result = llm.build_arctic();
|
11225
|
+
} break;
|
10916
11226
|
default:
|
10917
11227
|
GGML_ASSERT(false);
|
10918
11228
|
}
|
@@ -11032,11 +11342,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
11032
11342
|
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
11033
11343
|
f = -INFINITY;
|
11034
11344
|
} else {
|
11035
|
-
|
11345
|
+
if (hparams.use_alibi) {
|
11346
|
+
f = -fabs(lctx.kv_self.cells[i].pos - pos);
|
11347
|
+
} else {
|
11348
|
+
f = 0.0f;
|
11349
|
+
}
|
11036
11350
|
}
|
11037
11351
|
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
11038
11352
|
}
|
11039
11353
|
}
|
11354
|
+
|
11355
|
+
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
11356
|
+
for (int j = 0; j < n_kv; ++j) {
|
11357
|
+
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
11358
|
+
}
|
11359
|
+
}
|
11040
11360
|
}
|
11041
11361
|
} else {
|
11042
11362
|
// when using kv cache, the mask needs to match the kv cache size
|
@@ -11055,7 +11375,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
11055
11375
|
float f = -INFINITY;
|
11056
11376
|
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
11057
11377
|
if (batch.seq_id[i][s] == seq_id) {
|
11058
|
-
|
11378
|
+
if (hparams.use_alibi) {
|
11379
|
+
f = -fabs(batch.pos[i] - batch.pos[j]);
|
11380
|
+
} else {
|
11381
|
+
f = 0.0f;
|
11382
|
+
}
|
11059
11383
|
break;
|
11060
11384
|
}
|
11061
11385
|
}
|
@@ -11071,21 +11395,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
11071
11395
|
}
|
11072
11396
|
}
|
11073
11397
|
|
11074
|
-
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
|
11075
|
-
// this allows to process multiple sequences in parallel with ALiBi-based models
|
11076
|
-
if (hparams.use_alibi) {
|
11077
|
-
const int64_t n_kv = kv_self.n;
|
11078
|
-
|
11079
|
-
GGML_ASSERT(lctx.inp_KQ_pos);
|
11080
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
11081
|
-
|
11082
|
-
float * data = (float *) lctx.inp_KQ_pos->data;
|
11083
|
-
|
11084
|
-
for (int i = 0; i < n_kv; ++i) {
|
11085
|
-
data[i] = float(lctx.kv_self.cells[i].pos);
|
11086
|
-
}
|
11087
|
-
}
|
11088
|
-
|
11089
11398
|
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
11090
11399
|
const int64_t n_tokens = batch.n_tokens;
|
11091
11400
|
|
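The llama_set_inputs hunks above are the input-side half of the ALiBi change: instead of filling the removed KQ_pos tensor, the KQ mask itself now stores -|pos_i - pos_j| for ALiBi models (0 otherwise), and rows past n_tokens are padded with -INFINITY. Below is a compact sketch of filling such a mask for one sequence; the padding constant and positions are illustrative.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const bool use_alibi = true;
    const int  n_tokens  = 3;
    const int  n_kv      = 4;
    const int  n_pad     = 4;   // stand-in for GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)

    std::vector<float> mask(n_pad * n_kv, 0.0f);
    const int pos[4] = {0, 1, 2, 3};            // cache positions; queries sit at pos 1..3

    for (int j = 0; j < n_tokens; ++j) {        // query row
        const int qpos = j + 1;
        for (int i = 0; i < n_kv; ++i) {        // kv column
            if (pos[i] > qpos) {
                mask[j*n_kv + i] = -INFINITY;   // causal: cannot attend to the future
            } else {
                mask[j*n_kv + i] = use_alibi ? -std::fabs((float)(pos[i] - qpos)) : 0.0f;
            }
        }
    }
    for (int j = n_tokens; j < n_pad; ++j)      // padded rows are fully masked
        for (int i = 0; i < n_kv; ++i)
            mask[j*n_kv + i] = -INFINITY;

    for (int j = 0; j < n_pad; ++j) {
        for (int i = 0; i < n_kv; ++i) std::printf("%6.1f ", mask[j*n_kv + i]);
        std::printf("\n");
    }
    return 0;
}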
@@ -11259,11 +11568,6 @@ static void llama_graph_compute(
|
|
11259
11568
|
llama_context & lctx,
|
11260
11569
|
ggml_cgraph * gf,
|
11261
11570
|
int n_threads) {
|
11262
|
-
#ifdef GGML_USE_MPI
|
11263
|
-
const int64_t n_layer = lctx.model.hparams.n_layer;
|
11264
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
11265
|
-
#endif
|
11266
|
-
|
11267
11571
|
#ifdef GGML_USE_METAL
|
11268
11572
|
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
11269
11573
|
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
@@ -11278,10 +11582,6 @@ static void llama_graph_compute(
|
|
11278
11582
|
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
11279
11583
|
|
11280
11584
|
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
11281
|
-
|
11282
|
-
#ifdef GGML_USE_MPI
|
11283
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
11284
|
-
#endif
|
11285
11585
|
}
|
11286
11586
|
|
11287
11587
|
// decode a batch of tokens by evaluating the transformer
|
@@ -11319,12 +11619,6 @@ static int llama_decode_internal(
|
|
11319
11619
|
}
|
11320
11620
|
lctx.n_queued_tokens += n_tokens_all;
|
11321
11621
|
|
11322
|
-
#ifdef GGML_USE_MPI
|
11323
|
-
// TODO: needs fix after #3228
|
11324
|
-
GGML_ASSERT(false && "not implemented");
|
11325
|
-
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
11326
|
-
#endif
|
11327
|
-
|
11328
11622
|
auto & kv_self = lctx.kv_self;
|
11329
11623
|
|
11330
11624
|
const int64_t n_embd = hparams.n_embd;
|
@@ -11455,7 +11749,8 @@ static int llama_decode_internal(
|
|
11455
11749
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
11456
11750
|
// after enough generations, the benefit from this heuristic disappears
|
11457
11751
|
// if we start defragmenting the cache, the benefit from this will be more important
|
11458
|
-
|
11752
|
+
const uint32_t pad = llama_kv_cache_get_padding(cparams);
|
11753
|
+
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
|
11459
11754
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
11460
11755
|
}
|
11461
11756
|
}
|
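The llama_decode_internal hunk above rounds the active KV window up to a multiple of llama_kv_cache_get_padding(cparams). A sketch of that rounding follows; the pad_up helper is my reading of ggml's usual GGML_PAD definition and the sizes are made up.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Assumed to match GGML_PAD: round x up to a multiple of n.
static uint32_t pad_up(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const uint32_t kv_size  = 4096; // total cache size
    const uint32_t cell_max = 1234; // highest occupied cell + 1 (stand-in)
    const uint32_t pad      = 256;  // stand-in for llama_kv_cache_get_padding()

    // kv_self.n = min(size, max(pad, GGML_PAD(cell_max, pad)))
    const uint32_t n = std::min(kv_size, std::max(pad, pad_up(cell_max, pad)));
    std::printf("attend to %u of %u cells\n", n, kv_size); // 1280 here
    return 0;
}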
@@ -12200,13 +12495,14 @@ struct llm_tokenizer_bpe {
|
|
12200
12495
|
|
12201
12496
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
12202
12497
|
int final_prev_index = -1;
|
12498
|
+
bool ignore_merges = false;
|
12203
12499
|
|
12204
12500
|
std::vector<std::string> word_collection;
|
12205
12501
|
switch (vocab.type) {
|
12206
12502
|
case LLAMA_VOCAB_TYPE_BPE:
|
12207
12503
|
switch (vocab.type_pre) {
|
12208
12504
|
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
12209
|
-
|
12505
|
+
ignore_merges = true;
|
12210
12506
|
word_collection = unicode_regex_split(text, {
|
12211
12507
|
// original regex from tokenizer.json
|
12212
12508
|
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
@@ -12215,6 +12511,12 @@ struct llm_tokenizer_bpe {
|
|
12215
12511
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12216
12512
|
});
|
12217
12513
|
break;
|
12514
|
+
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
12515
|
+
word_collection = unicode_regex_split(text, {
|
12516
|
+
// same as llama3
|
12517
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12518
|
+
});
|
12519
|
+
break;
|
12218
12520
|
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
12219
12521
|
word_collection = unicode_regex_split(text, {
|
12220
12522
|
"[\r\n]",
|
@@ -12266,6 +12568,7 @@ struct llm_tokenizer_bpe {
|
|
12266
12568
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12267
12569
|
});
|
12268
12570
|
break;
|
12571
|
+
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
|
12269
12572
|
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
12270
12573
|
word_collection = unicode_regex_split(text, {
|
12271
12574
|
// original regex from tokenizer.json
|
@@ -12298,6 +12601,11 @@ struct llm_tokenizer_bpe {
|
|
12298
12601
|
int index = 0;
|
12299
12602
|
size_t offset = 0;
|
12300
12603
|
|
12604
|
+
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
12605
|
+
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
12606
|
+
offset = word.size();
|
12607
|
+
}
|
12608
|
+
|
12301
12609
|
while (offset < word.size()) {
|
12302
12610
|
llm_symbol sym;
|
12303
12611
|
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
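The tokenizer hunks above add an ignore_merges fast path for the LLaMA-3 pre-tokenizer (and register DBRX/StableLM2 pre-tokenizer regexes): when a whole pre-tokenized word is already present in the vocabulary it is emitted as a single symbol and byte-pair merging is skipped for it. A self-contained sketch of that lookup-before-merge idea, with an invented toy vocabulary:

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    // Toy vocab: word -> token id (illustrative, not the real LLaMA-3 vocab).
    const std::unordered_map<std::string, int> token_to_id = {
        {"hello", 101}, {"he", 7}, {"llo", 9},
    };
    const bool ignore_merges = true; // set for the LLaMA-3 pre-tokenizer in the diff

    const std::vector<std::string> words = {"hello", "world"};
    for (const auto & word : words) {
        auto it = token_to_id.find(word);
        if (ignore_merges && it != token_to_id.end()) {
            std::printf("'%s' -> token %d (whole-word hit, merges skipped)\n",
                        word.c_str(), it->second);
            continue;
        }
        std::printf("'%s' -> fall back to byte-pair merging\n", word.c_str());
    }
    return 0;
}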
@@ -12483,16 +12791,16 @@ struct llm_tokenizer_wpm {
|
|
12483
12791
|
// to lowercase, pad chinese characters, pad punctuation
|
12484
12792
|
std::string new_str = "";
|
12485
12793
|
for (uint32_t code : cpts_nfd) {
|
12486
|
-
|
12487
|
-
if (
|
12794
|
+
const codepoint_flags flags = unicode_cpt_flags(code);
|
12795
|
+
if (flags.is_accent_mark || flags.is_control) {
|
12488
12796
|
continue;
|
12489
12797
|
}
|
12490
12798
|
code = unicode_tolower(code);
|
12491
|
-
if (
|
12799
|
+
if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
|
12492
12800
|
code = ' ';
|
12493
12801
|
}
|
12494
12802
|
std::string s = unicode_cpt_to_utf8(code);
|
12495
|
-
if (
|
12803
|
+
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
12496
12804
|
new_str += " ";
|
12497
12805
|
new_str += s;
|
12498
12806
|
new_str += " ";
|
@@ -12695,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12695
13003
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
12696
13004
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
12697
13005
|
|
13006
|
+
static const bool rtrim = true; //TODO: as param
|
13007
|
+
bool is_prev_special = false;
|
13008
|
+
bool special_token_rtrim = false;
|
13009
|
+
|
12698
13010
|
if (add_special && vocab.special_add_bos != 0) {
|
12699
13011
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
12700
13012
|
output.push_back(vocab.special_bos_id);
|
13013
|
+
is_prev_special = true;
|
12701
13014
|
}
|
12702
13015
|
|
12703
13016
|
for (const auto & fragment : fragment_buffer) {
|
@@ -12709,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12709
13022
|
// and passing 'add space prefix' as bool argument
|
12710
13023
|
//
|
12711
13024
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
12712
|
-
|
12713
|
-
|
12714
|
-
|
13025
|
+
|
13026
|
+
if (special_token_rtrim) {
|
13027
|
+
size_t num_whitespaces = 0;
|
13028
|
+
while (isspace(raw_text[num_whitespaces])) {
|
13029
|
+
num_whitespaces++;
|
13030
|
+
}
|
13031
|
+
if (num_whitespaces == raw_text.size()) {
|
13032
|
+
continue; // skip if all whitespaces
|
13033
|
+
}
|
13034
|
+
raw_text = raw_text.substr(num_whitespaces);
|
13035
|
+
}
|
13036
|
+
|
13037
|
+
if (vocab.add_space_prefix) {
|
13038
|
+
if (!output.size() || is_prev_special) { // prefix with space if first token
|
13039
|
+
raw_text = " " + raw_text;
|
12715
13040
|
}
|
12716
13041
|
}
|
12717
13042
|
|
@@ -12723,9 +13048,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12723
13048
|
tokenizer.tokenize(raw_text, output);
|
12724
13049
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
12725
13050
|
output.push_back(fragment.token);
|
13051
|
+
is_prev_special = true;
|
13052
|
+
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
13053
|
+
special_token_rtrim = rtrim
|
13054
|
+
&& fragment.token != vocab.special_bos_id
|
13055
|
+
&& fragment.token != vocab.special_unk_id
|
13056
|
+
&& fragment.token != vocab.special_eos_id;
|
12726
13057
|
}
|
12727
13058
|
}
|
12728
13059
|
|
13060
|
+
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
13061
|
+
LLAMA_LOG_WARN(
|
13062
|
+
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
13063
|
+
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
13064
|
+
"Are you sure this is what you want?\n", __FUNCTION__);
|
13065
|
+
}
|
13066
|
+
|
12729
13067
|
if (add_special && vocab.special_add_eos == 1) {
|
12730
13068
|
GGML_ASSERT(vocab.special_eos_id != -1);
|
12731
13069
|
output.push_back(vocab.special_eos_id);
|
@@ -12752,7 +13090,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12752
13090
|
}
|
12753
13091
|
}
|
12754
13092
|
|
12755
|
-
|
13093
|
+
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
13094
|
+
LLAMA_LOG_WARN(
|
13095
|
+
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
13096
|
+
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
13097
|
+
"Are you sure this is what you want?\n", __FUNCTION__);
|
13098
|
+
}
|
13099
|
+
|
13100
|
+
if (add_special && vocab.special_add_eos == 1) {
|
13101
|
+
GGML_ASSERT(vocab.special_add_eos != -1);
|
13102
|
+
output.push_back(vocab.special_eos_id);
|
13103
|
+
}
|
12756
13104
|
} break;
|
12757
13105
|
case LLAMA_VOCAB_TYPE_WPM:
|
12758
13106
|
{
|
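The warning added above triggers when the tokenizer prepended a BOS token (output[0]) and the prompt text itself produced another BOS right after it (output[1]), which usually means the caller baked BOS into the prompt while also requesting special tokens. A tiny sketch of the check; the token id and the simplified condition are placeholders.

#include <cstdio>
#include <vector>

int main() {
    const int  special_bos_id = 1;   // placeholder id
    const bool add_special    = true;

    // Suppose the prompt text itself started with a BOS token.
    std::vector<int> output = {special_bos_id, special_bos_id, 15043, 3186};

    if (add_special && output.size() >= 2 && output[1] == special_bos_id) {
        std::printf("warning: prompt now starts with two BOS tokens\n");
    }
    return 0;
}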
@@ -13106,6 +13454,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
|
13106
13454
|
return rejects;
|
13107
13455
|
}
|
13108
13456
|
|
13457
|
+
static bool llama_grammar_detect_left_recursion(
|
13458
|
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
13459
|
+
size_t rule_index,
|
13460
|
+
std::vector<bool> * rules_visited,
|
13461
|
+
std::vector<bool> * rules_in_progress,
|
13462
|
+
std::vector<bool> * rules_may_be_empty) {
|
13463
|
+
if ((*rules_in_progress)[rule_index]) {
|
13464
|
+
return true;
|
13465
|
+
}
|
13466
|
+
|
13467
|
+
(*rules_in_progress)[rule_index] = true;
|
13468
|
+
|
13469
|
+
const std::vector<llama_grammar_element> & rule = rules[rule_index];
|
13470
|
+
|
13471
|
+
// First check if the rule might produce the empty string. This could be done combined with the second
|
13472
|
+
// step but it's more readable as two steps.
|
13473
|
+
bool at_rule_start = true;
|
13474
|
+
for (size_t i = 0; i < rule.size(); i++) {
|
13475
|
+
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
13476
|
+
if (at_rule_start) {
|
13477
|
+
(*rules_may_be_empty)[rule_index] = true;
|
13478
|
+
break;
|
13479
|
+
}
|
13480
|
+
at_rule_start = true;
|
13481
|
+
} else {
|
13482
|
+
at_rule_start = false;
|
13483
|
+
}
|
13484
|
+
}
|
13485
|
+
|
13486
|
+
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
13487
|
+
// be empty)
|
13488
|
+
bool recurse_into_nonterminal = true;
|
13489
|
+
for (size_t i = 0; i < rule.size(); i++) {
|
13490
|
+
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
13491
|
+
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
13492
|
+
return true;
|
13493
|
+
}
|
13494
|
+
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
13495
|
+
recurse_into_nonterminal = false;
|
13496
|
+
}
|
13497
|
+
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
13498
|
+
recurse_into_nonterminal = true;
|
13499
|
+
} else {
|
13500
|
+
recurse_into_nonterminal = false;
|
13501
|
+
}
|
13502
|
+
}
|
13503
|
+
|
13504
|
+
(*rules_in_progress)[rule_index] = false;
|
13505
|
+
(*rules_visited)[rule_index] = true;
|
13506
|
+
return false;
|
13507
|
+
}
|
13508
|
+
|
13109
13509
|
//
|
13110
13510
|
// grammar - external
|
13111
13511
|
//
|
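llama_grammar_detect_left_recursion above is a depth-first walk that marks rules as in-progress and reports a cycle when a rule's leftmost expansion can reach back to itself; llama_grammar_init now throws instead of looping forever on such grammars. Below is a reduced standalone sketch of the same in-progress-marking DFS on a toy rule graph; the rules and their leftmost references are invented, and the may-be-empty prefix handling is omitted.

#include <cstdio>
#include <vector>

// Each rule lists the rules that can appear in leftmost position of one of
// its alternatives (a simplification of the real element-by-element scan).
static bool left_recursive(const std::vector<std::vector<int>> & leftmost,
                           int rule, std::vector<char> & in_progress,
                           std::vector<char> & visited) {
    if (in_progress[rule]) return true;   // reached a rule we are still expanding
    if (visited[rule])     return false;
    in_progress[rule] = 1;
    for (int next : leftmost[rule]) {
        if (left_recursive(leftmost, next, in_progress, visited)) return true;
    }
    in_progress[rule] = 0;
    visited[rule]     = 1;
    return false;
}

int main() {
    // rule 0: expr ::= expr "+" term | term   (left recursive)
    // rule 1: term ::= "x"
    const std::vector<std::vector<int>> leftmost = {{0, 1}, {}};
    std::vector<char> in_progress(leftmost.size(), 0), visited(leftmost.size(), 0);

    for (size_t i = 0; i < leftmost.size(); ++i) {
        if (left_recursive(leftmost, (int) i, in_progress, visited)) {
            std::printf("left recursion detected at rule %zu\n", i);
        }
    }
    return 0;
}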
@@ -13125,6 +13525,19 @@ struct llama_grammar * llama_grammar_init(
|
|
13125
13525
|
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
13126
13526
|
}
|
13127
13527
|
|
13528
|
+
// Check for left recursion
|
13529
|
+
std::vector<bool> rules_visited(n_rules);
|
13530
|
+
std::vector<bool> rules_in_progress(n_rules);
|
13531
|
+
std::vector<bool> rules_may_be_empty(n_rules);
|
13532
|
+
for (size_t i = 0; i < n_rules; i++) {
|
13533
|
+
if (rules_visited[i]) {
|
13534
|
+
continue;
|
13535
|
+
}
|
13536
|
+
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
13537
|
+
throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
|
13538
|
+
}
|
13539
|
+
}
|
13540
|
+
|
13128
13541
|
// loop over alternates of start rule to build initial stacks
|
13129
13542
|
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
13130
13543
|
pos = vec_rules[start_rule_index].data();
|
@@ -13147,6 +13560,9 @@ struct llama_grammar * llama_grammar_init(
|
|
13147
13560
|
}
|
13148
13561
|
} while (true);
|
13149
13562
|
|
13563
|
+
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
13564
|
+
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
13565
|
+
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
13150
13566
|
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
13151
13567
|
}
|
13152
13568
|
|
@@ -13741,9 +14157,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
|
13741
14157
|
|
13742
14158
|
// Sample the next word X using top-k sampling
|
13743
14159
|
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
13744
|
-
|
13745
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13746
|
-
}
|
14160
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13747
14161
|
llama_token X = llama_sample_token(ctx, candidates);
|
13748
14162
|
t_start_sample_us = ggml_time_us();
|
13749
14163
|
|
@@ -13757,9 +14171,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
|
13757
14171
|
// Update mu using the learning rate and error
|
13758
14172
|
*mu = *mu - eta * e;
|
13759
14173
|
|
13760
|
-
|
13761
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13762
|
-
}
|
14174
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
13763
14175
|
return X;
|
13764
14176
|
}
|
13765
14177
|
|
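Editor's note: the two hunks above only move the t_sample_us accounting so it is recorded on every path, not just inside the top-k branch. For context, a minimal sketch of how this sampler is driven from the caller side, assuming a llama_context whose logits were already produced by llama_decode; the tau/eta/m values are conventional defaults, not mandated by the API:

    #include <vector>
    #include "llama.h"

    // Build a candidate array from the last logits and sample one token with
    // Mirostat v1; mu is caller-owned state carried across calls.
    static llama_token sample_mirostat(llama_context * ctx, float * mu) {
        const llama_model * model   = llama_get_model(ctx);
        const int           n_vocab = llama_n_vocab(model);
        const float       * logits  = llama_get_logits(ctx);

        std::vector<llama_token_data> cand(n_vocab);
        for (int i = 0; i < n_vocab; i++) {
            cand[i] = llama_token_data{ (llama_token) i, logits[i], 0.0f };
        }
        llama_token_data_array cand_arr = { cand.data(), cand.size(), false };

        const float tau = 5.0f;   // target surprise
        const float eta = 0.1f;   // learning rate
        const int   m   = 100;    // tokens used to estimate s_hat
        return llama_sample_token_mirostat(ctx, &cand_arr, tau, eta, m, mu);
    }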
@@ -14344,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
14344
14756
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
14345
14757
|
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
14346
14758
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
14347
|
-
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
14348
|
-
(qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
14349
14759
|
if (qs.model.type == MODEL_70B) {
|
14350
14760
|
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
14351
14761
|
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
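Editor's note: the ftype values these heuristics branch on are chosen by whoever drives the quantize API. A minimal sketch of requesting one of them through llama_model_quantize; the file names are placeholders:

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // one of the ftypes handled above
        qparams.nthread = 4;

        // placeholder input/output paths
        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
        if (rc != 0) {
            std::fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }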
@@ -15246,6 +15656,7 @@ struct llama_model_params llama_model_default_params() {
|
|
15246
15656
|
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
15247
15657
|
/*.main_gpu =*/ 0,
|
15248
15658
|
/*.tensor_split =*/ nullptr,
|
15659
|
+
/*.rpc_servers =*/ nullptr,
|
15249
15660
|
/*.progress_callback =*/ nullptr,
|
15250
15661
|
/*.progress_callback_user_data =*/ nullptr,
|
15251
15662
|
/*.kv_overrides =*/ nullptr,
|
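Editor's note: the new rpc_servers field defaults to nullptr, so existing callers are unaffected. A minimal sketch of opting in when loading a model, assuming a build with GGML_USE_RPC; the endpoints and model path are placeholders:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        // Comma-separated list of RPC endpoints (placeholder addresses).
        mparams.rpc_servers = "192.168.0.10:50052,192.168.0.11:50052";

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            llama_backend_free();
            return 1;
        }

        // ... create a context and run inference ...

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }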
@@ -15316,7 +15727,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
15316
15727
|
}
|
15317
15728
|
|
15318
15729
|
size_t llama_max_devices(void) {
|
15319
|
-
#if defined(
|
15730
|
+
#if defined(GGML_USE_RPC)
|
15731
|
+
return GGML_RPC_MAX_SERVERS;
|
15732
|
+
#elif defined(GGML_USE_METAL)
|
15320
15733
|
return 1;
|
15321
15734
|
#elif defined(GGML_USE_CUDA)
|
15322
15735
|
return GGML_CUDA_MAX_DEVICES;
|
@@ -15339,7 +15752,7 @@ bool llama_supports_mlock(void) {
|
|
15339
15752
|
|
15340
15753
|
bool llama_supports_gpu_offload(void) {
|
15341
15754
|
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
15342
|
-
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
15755
|
+
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
15343
15756
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
15344
15757
|
return true;
|
15345
15758
|
#else
|
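Editor's note: with RPC enabled these capability queries change answers: llama_max_devices reports GGML_RPC_MAX_SERVERS and llama_supports_gpu_offload returns true even without a local GPU backend. A small sketch of probing them at runtime:

    #include <cstdio>
    #include "llama.h"

    int main() {
        std::printf("max devices  : %zu\n", llama_max_devices());
        std::printf("gpu offload  : %s\n",  llama_supports_gpu_offload() ? "yes" : "no");
        std::printf("mmap / mlock : %s / %s\n",
                    llama_supports_mmap()  ? "yes" : "no",
                    llama_supports_mlock() ? "yes" : "no");
        return 0;
    }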
@@ -15356,10 +15769,6 @@ void llama_backend_init(void) {
|
|
15356
15769
|
struct ggml_context * ctx = ggml_init(params);
|
15357
15770
|
ggml_free(ctx);
|
15358
15771
|
}
|
15359
|
-
|
15360
|
-
#ifdef GGML_USE_MPI
|
15361
|
-
ggml_mpi_backend_init();
|
15362
|
-
#endif
|
15363
15772
|
}
|
15364
15773
|
|
15365
15774
|
void llama_numa_init(enum ggml_numa_strategy numa) {
|
@@ -15369,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
|
|
15369
15778
|
}
|
15370
15779
|
|
15371
15780
|
void llama_backend_free(void) {
|
15372
|
-
#ifdef GGML_USE_MPI
|
15373
|
-
ggml_mpi_backend_free();
|
15374
|
-
#endif
|
15375
15781
|
ggml_quantize_free();
|
15376
15782
|
}
|
15377
15783
|
|
@@ -15402,7 +15808,17 @@ struct llama_model * llama_load_model_from_file(
|
|
15402
15808
|
return true;
|
15403
15809
|
};
|
15404
15810
|
}
|
15405
|
-
|
15811
|
+
if (params.rpc_servers != nullptr) {
|
15812
|
+
// split the servers set them into model->rpc_servers
|
15813
|
+
std::string servers(params.rpc_servers);
|
15814
|
+
size_t pos = 0;
|
15815
|
+
while ((pos = servers.find(",")) != std::string::npos) {
|
15816
|
+
std::string server = servers.substr(0, pos);
|
15817
|
+
model->rpc_servers.push_back(server);
|
15818
|
+
servers.erase(0, pos + 1);
|
15819
|
+
}
|
15820
|
+
model->rpc_servers.push_back(servers);
|
15821
|
+
}
|
15406
15822
|
int status = llama_model_load(path_model, *model, params);
|
15407
15823
|
GGML_ASSERT(status <= 0);
|
15408
15824
|
if (status < 0) {
|
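Editor's note: the loop above splits params.rpc_servers on commas and pushes whatever remains after the last comma, so a single endpoint without any separator still produces one entry. A standalone illustration of the same splitting behaviour, kept outside llama.cpp:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Mirrors the comma-splitting applied to params.rpc_servers above.
    static std::vector<std::string> split_servers(std::string servers) {
        std::vector<std::string> out;
        size_t pos = 0;
        while ((pos = servers.find(",")) != std::string::npos) {
            out.push_back(servers.substr(0, pos));
            servers.erase(0, pos + 1);
        }
        out.push_back(servers);  // last (or only) entry
        return out;
    }

    int main() {
        for (const auto & s : split_servers("host-a:50052,host-b:50052")) {
            std::printf("endpoint: %s\n", s.c_str());
        }
        return 0;
    }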
@@ -15441,6 +15857,11 @@ struct llama_context * llama_new_context_with_model(
|
|
15441
15857
|
return nullptr;
|
15442
15858
|
}
|
15443
15859
|
|
15860
|
+
if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15861
|
+
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15862
|
+
params.flash_attn = false;
|
15863
|
+
}
|
15864
|
+
|
15444
15865
|
llama_context * ctx = new llama_context(*model);
|
15445
15866
|
|
15446
15867
|
const auto & hparams = model->hparams;
|
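Editor's note: flash attention is requested through llama_context_params; the check added above merely downgrades that request for Grok models. A minimal sketch of turning it on when creating a context, assuming the model has already been loaded:

    #include "llama.h"

    // Create a context with flash attention requested; for Grok the flag is
    // forced off by the warning path shown above.
    static llama_context * make_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx      = 4096;
        cparams.flash_attn = true;
        return llama_new_context_with_model(model, cparams);
    }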
@@ -15464,7 +15885,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15464
15885
|
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
15465
15886
|
|
15466
15887
|
// this is necessary due to kv_self.n being padded later during inference
|
15467
|
-
cparams.n_ctx = GGML_PAD(cparams.n_ctx,
|
15888
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
|
15468
15889
|
|
15469
15890
|
// with causal attention, the batch size is limited by the context size
|
15470
15891
|
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
@@ -15499,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15499
15920
|
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
15500
15921
|
}
|
15501
15922
|
|
15923
|
+
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
|
15502
15924
|
cparams.causal_attn = hparams.causal_attn;
|
15503
15925
|
|
15504
15926
|
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
@@ -15509,16 +15931,6 @@ struct llama_context * llama_new_context_with_model(
|
|
15509
15931
|
}
|
15510
15932
|
}
|
15511
15933
|
|
15512
|
-
if (cparams.flash_attn && hparams.use_alibi) {
|
15513
|
-
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
|
15514
|
-
cparams.flash_attn = false;
|
15515
|
-
}
|
15516
|
-
|
15517
|
-
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15518
|
-
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15519
|
-
cparams.flash_attn = false;
|
15520
|
-
}
|
15521
|
-
|
15522
15934
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
15523
15935
|
params.seed = time(NULL);
|
15524
15936
|
}
|
@@ -15554,7 +15966,17 @@ struct llama_context * llama_new_context_with_model(
|
|
15554
15966
|
|
15555
15967
|
if (!hparams.vocab_only) {
|
15556
15968
|
// initialize backends
|
15557
|
-
#
|
15969
|
+
#if defined(GGML_USE_RPC)
|
15970
|
+
for (auto & server : model->rpc_servers) {
|
15971
|
+
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
15972
|
+
if (backend == nullptr) {
|
15973
|
+
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
15974
|
+
llama_free(ctx);
|
15975
|
+
return nullptr;
|
15976
|
+
}
|
15977
|
+
ctx->backends.push_back(backend);
|
15978
|
+
}
|
15979
|
+
#elif defined(GGML_USE_METAL)
|
15558
15980
|
if (model->n_gpu_layers > 0) {
|
15559
15981
|
ctx->backend_metal = ggml_backend_metal_init();
|
15560
15982
|
if (ctx->backend_metal == nullptr) {
|
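Editor's note: outside of llama_new_context_with_model, the same RPC backend can be brought up directly through the ggml-backend interface added in ggml-rpc.h. A hedged sketch, assuming a build with GGML_USE_RPC; the endpoint is a placeholder that must point at a running RPC server:

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-rpc.h"

    int main() {
        ggml_backend_t backend = ggml_backend_rpc_init("192.168.0.10:50052");
        if (backend == nullptr) {
            std::fprintf(stderr, "failed to connect RPC backend\n");
            return 1;
        }
        // ... schedule graphs on the backend via ggml-backend as usual ...
        ggml_backend_free(backend);
        return 0;
    }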
@@ -15710,7 +16132,11 @@ struct llama_context * llama_new_context_with_model(
|
|
15710
16132
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
15711
16133
|
|
15712
16134
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
15713
|
-
bool pipeline_parallel =
|
16135
|
+
bool pipeline_parallel =
|
16136
|
+
llama_get_device_count(*model) > 1 &&
|
16137
|
+
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
16138
|
+
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
16139
|
+
params.offload_kqv;
|
15714
16140
|
#ifndef GGML_USE_CUDA
|
15715
16141
|
// pipeline parallelism requires support for async compute and events
|
15716
16142
|
// currently this is only implemented in the CUDA backend
|
@@ -15753,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
|
|
15753
16179
|
}
|
15754
16180
|
}
|
15755
16181
|
|
15756
|
-
#ifdef GGML_USE_MPI
|
15757
|
-
ctx->ctx_mpi = ggml_mpi_init();
|
15758
|
-
|
15759
|
-
if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
|
15760
|
-
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
|
15761
|
-
// TODO: needs fix after #3228
|
15762
|
-
GGML_ASSERT(false && "not implemented");
|
15763
|
-
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
|
15764
|
-
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
|
15765
|
-
llama_backend_free();
|
15766
|
-
exit(1);
|
15767
|
-
}
|
15768
|
-
#endif
|
15769
|
-
|
15770
16182
|
return ctx;
|
15771
16183
|
}
|
15772
16184
|
|
@@ -15803,11 +16215,11 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15803
16215
|
// these models do not use RoPE
|
15804
16216
|
case LLM_ARCH_GPT2:
|
15805
16217
|
case LLM_ARCH_GPTJ:
|
15806
|
-
case LLM_ARCH_GPTNEOX:
|
15807
16218
|
case LLM_ARCH_MPT:
|
15808
16219
|
case LLM_ARCH_REFACT:
|
15809
16220
|
case LLM_ARCH_BLOOM:
|
15810
16221
|
case LLM_ARCH_MAMBA:
|
16222
|
+
case LLM_ARCH_JINA_BERT_V2:
|
15811
16223
|
return LLAMA_ROPE_TYPE_NONE;
|
15812
16224
|
|
15813
16225
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
@@ -15822,13 +16234,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15822
16234
|
case LLM_ARCH_XVERSE:
|
15823
16235
|
case LLM_ARCH_COMMAND_R:
|
15824
16236
|
case LLM_ARCH_OLMO:
|
16237
|
+
case LLM_ARCH_ARCTIC:
|
15825
16238
|
return LLAMA_ROPE_TYPE_NORM;
|
15826
16239
|
|
15827
16240
|
// the pairs of head values are offset by n_rot/2
|
15828
16241
|
case LLM_ARCH_FALCON:
|
15829
16242
|
case LLM_ARCH_GROK:
|
15830
16243
|
case LLM_ARCH_DBRX:
|
15831
|
-
case LLM_ARCH_PERSIMMON:
|
15832
16244
|
case LLM_ARCH_BERT:
|
15833
16245
|
case LLM_ARCH_NOMIC_BERT:
|
15834
16246
|
case LLM_ARCH_STABLELM:
|
@@ -15839,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15839
16251
|
case LLM_ARCH_PHI3:
|
15840
16252
|
case LLM_ARCH_GEMMA:
|
15841
16253
|
case LLM_ARCH_STARCODER2:
|
16254
|
+
case LLM_ARCH_GPTNEOX:
|
15842
16255
|
return LLAMA_ROPE_TYPE_NEOX;
|
15843
16256
|
|
15844
16257
|
// all model arches should be listed explicitly here
|
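Editor's note: callers can query the architecture-to-RoPE mapping above without knowing architecture names, via llama_rope_type. A small sketch, assuming model was loaded earlier:

    #include <cstdio>
    #include "llama.h"

    static void print_rope_type(const llama_model * model) {
        switch (llama_rope_type(model)) {
            case LLAMA_ROPE_TYPE_NONE: std::printf("no RoPE\n");            break;
            case LLAMA_ROPE_TYPE_NORM: std::printf("normal RoPE\n");        break;
            case LLAMA_ROPE_TYPE_NEOX: std::printf("NeoX-style RoPE\n");    break;
            default:                   std::printf("other RoPE variant\n"); break;
        }
    }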
@@ -15998,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
|
15998
16411
|
}
|
15999
16412
|
|
16000
16413
|
// make tensors
|
16414
|
+
cvec.tensors.reserve(model.hparams.n_layer);
|
16001
16415
|
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
16002
16416
|
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
16003
16417
|
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
|
@@ -16006,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
|
16006
16420
|
}
|
16007
16421
|
|
16008
16422
|
// allocate tensors / buffers and zero
|
16423
|
+
cvec.ctxs.reserve(ctx_map.size());
|
16424
|
+
cvec.bufs.reserve(ctx_map.size());
|
16009
16425
|
for (auto it : ctx_map) {
|
16010
16426
|
ggml_backend_buffer_type_t buft = it.first;
|
16011
16427
|
ggml_context * ctx = it.second;
|
@@ -16829,13 +17245,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
|
16829
17245
|
}
|
16830
17246
|
else {
|
16831
17247
|
if (cell_range_begin != kv_self.size) {
|
16832
|
-
cell_ranges.
|
17248
|
+
cell_ranges.emplace_back(cell_range_begin, i);
|
16833
17249
|
cell_range_begin = kv_self.size;
|
16834
17250
|
}
|
16835
17251
|
}
|
16836
17252
|
}
|
16837
17253
|
if (cell_range_begin != kv_self.size) {
|
16838
|
-
cell_ranges.push_back({cell_range_begin, kv_self.size});
|
17254
|
+
cell_ranges.emplace_back(cell_range_begin, kv_self.size);
|
16839
17255
|
}
|
16840
17256
|
|
16841
17257
|
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
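Editor's note: the cell ranges collected here back the per-sequence state API. A hedged usage sketch of round-tripping one sequence's KV state, assuming the llama_state_seq_* functions declared in llama.h and two contexts built from the same model:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Copy the KV cache contents of sequence 0 from one context into another.
    static bool copy_seq_state(llama_context * src, llama_context * dst) {
        const llama_seq_id seq  = 0;
        const size_t       size = llama_state_seq_get_size(src, seq);

        std::vector<uint8_t> buf(size);
        if (llama_state_seq_get_data(src, buf.data(), seq) != size) {
            return false;
        }
        return llama_state_seq_set_data(dst, buf.data(), seq) != 0;
    }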
@@ -17214,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
|
|
17214
17630
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
17215
17631
|
}
|
17216
17632
|
|
17633
|
+
uint32_t llama_n_threads(struct llama_context * ctx) {
|
17634
|
+
return ctx->cparams.n_threads;
|
17635
|
+
}
|
17636
|
+
|
17637
|
+
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
|
17638
|
+
return ctx->cparams.n_threads_batch;
|
17639
|
+
}
|
17640
|
+
|
17217
17641
|
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
17218
17642
|
ctx->abort_callback = abort_callback;
|
17219
17643
|
ctx->abort_callback_data = abort_callback_data;
|
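Editor's note: the new getters simply read back what llama_set_n_threads stored in cparams. A minimal sketch, assuming ctx is an existing llama_context:

    #include "llama.h"

    static void configure_threads(llama_context * ctx) {
        // More threads for prompt (batch) processing than for single-token decode.
        llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/16);

        const uint32_t n_gen   = llama_n_threads(ctx);        // 8
        const uint32_t n_batch = llama_n_threads_batch(ctx);  // 16
        (void) n_gen; (void) n_batch;
    }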
@@ -17648,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
|
|
17648
18072
|
}
|
17649
18073
|
}
|
17650
18074
|
// llama2 templates seem to not care about "add_generation_prompt"
|
18075
|
+
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
|
18076
|
+
// Phi 3
|
18077
|
+
for (auto message : chat) {
|
18078
|
+
std::string role(message->role);
|
18079
|
+
ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
|
18080
|
+
}
|
18081
|
+
if (add_ass) {
|
18082
|
+
ss << "<|assistant|>\n";
|
18083
|
+
}
|
17651
18084
|
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
|
17652
18085
|
// zephyr template
|
17653
18086
|
for (auto message : chat) {
|
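Editor's note: with the block above, a template name of "phi3" (or any template string containing both <|assistant|> and <|end|>) renders each message as <|role|>, a newline, the content, then <|end|>, and appends <|assistant|> when add_ass is set. A usage sketch through the public entry point; passing a null model is allowed here because the template is given explicitly:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        const llama_chat_message chat[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Write a haiku about llamas."  },
        };
        const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

        std::vector<char> buf(1024);
        const int32_t n = llama_chat_apply_template(
            /*model=*/nullptr, /*tmpl=*/"phi3", chat, n_msg,
            /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            std::printf("%.*s", n, buf.data());
        }
        return 0;
    }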
@@ -17780,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
|
|
17780
18213
|
if (add_ass) {
|
17781
18214
|
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
17782
18215
|
}
|
17783
|
-
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
|
17784
|
-
// Phi 3
|
17785
|
-
for (auto message : chat) {
|
17786
|
-
std::string role(message->role);
|
17787
|
-
ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
|
17788
|
-
}
|
17789
|
-
if (add_ass) {
|
17790
|
-
ss << "<|assistant|>\n";
|
17791
|
-
}
|
17792
18216
|
} else {
|
17793
18217
|
// template not supported
|
17794
18218
|
return -1;
|
@@ -17910,6 +18334,7 @@ const char * llama_print_system_info(void) {
|
|
17910
18334
|
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
17911
18335
|
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
17912
18336
|
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
18337
|
+
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
17913
18338
|
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
17914
18339
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
17915
18340
|
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
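Editor's note: the new AVX512_BF16 flag is reported through the same capability string as the existing entries. A one-liner to inspect it:

    #include <cstdio>
    #include "llama.h"

    int main() {
        // Prints the feature string assembled above, e.g. "... AVX512_BF16 = 1 | ..."
        std::printf("%s\n", llama_print_system_info());
        return 0;
    }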
@@ -17970,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
|
17970
18395
|
g_state.log_callback_user_data = user_data;
|
17971
18396
|
#ifdef GGML_USE_METAL
|
17972
18397
|
ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
18398
|
+
#elif defined(GGML_USE_CUDA)
|
18399
|
+
ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
17973
18400
|
#endif
|
17974
18401
|
}
|
17975
18402
|
|
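Editor's note: after this change, the callback registered with llama_log_set is also forwarded to the CUDA backend when the library is built with GGML_USE_CUDA (previously only Metal received it). A minimal sketch of installing one:

    #include <cstdio>
    #include "llama.h"

    // Route llama.cpp (and, with this change, CUDA backend) log lines to stderr.
    static void log_to_stderr(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        std::fputs(text, stderr);
    }

    int main() {
        llama_log_set(log_to_stderr, /*user_data=*/nullptr);
        llama_backend_init();
        // ... load models, create contexts ...
        llama_backend_free();
        return 0;
    }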