llama_cpp 0.15.1 → 0.15.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
--- a/data/vendor/tmp/llama.cpp/llama.cpp
+++ b/data/vendor/tmp/llama.cpp/llama.cpp
@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -22,16 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-
-
-#
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -106,7 +103,7 @@
 #endif
 
 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 60
+#define LLAMA_MAX_EXPERTS 128
 
 //
 // logging
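Note: the cap is raised because Snowflake Arctic ships 128 experts per MoE layer. A minimal sketch of the kind of guard this constant supports (the wrapper function is illustrative, not code from this diff):

    // Sketch: reject models whose expert count exceeds the compile-time cap.
    static void check_expert_count(uint32_t n_expert) {
        if (n_expert > LLAMA_MAX_EXPERTS) {
            throw std::runtime_error("n_expert exceeds LLAMA_MAX_EXPERTS");
        }
    }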
@@ -201,10 +198,10 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -224,43 +221,45 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_ARCTIC,
     LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,           "llama"      },
-    { LLM_ARCH_FALCON,          "falcon"     },
-    { LLM_ARCH_GROK,            "grok"       },
-    { LLM_ARCH_GPT2,            "gpt2"       },
-    { LLM_ARCH_GPTJ,            "gptj"       },
-    { LLM_ARCH_GPTNEOX,         "gptneox"    },
-    { LLM_ARCH_MPT,             "mpt"        },
-    { LLM_ARCH_BAICHUAN,        "baichuan"   },
-    { LLM_ARCH_STARCODER,       "starcoder"  },
-    { LLM_ARCH_PERSIMMON,       "persimmon"  },
-    { LLM_ARCH_REFACT,          "refact"     },
-    { LLM_ARCH_BERT,            "bert"       },
-    { LLM_ARCH_NOMIC_BERT,      "nomic-bert" },
-    { LLM_ARCH_BLOOM,           "bloom"      },
-    { LLM_ARCH_STABLELM,        "stablelm"   },
-    { LLM_ARCH_QWEN,            "qwen"       },
-    { LLM_ARCH_QWEN2,           "qwen2"      },
-    { LLM_ARCH_QWEN2MOE,        "qwen2moe"   },
-    { LLM_ARCH_PHI2,            "phi2"       },
-    { LLM_ARCH_PHI3,            "phi3"       },
-    { LLM_ARCH_PLAMO,           "plamo"      },
-    { LLM_ARCH_CODESHELL,       "codeshell"  },
-    { LLM_ARCH_ORION,           "orion"      },
-    { LLM_ARCH_INTERNLM2,       "internlm2"  },
-    { LLM_ARCH_MINICPM,         "minicpm"    },
-    { LLM_ARCH_GEMMA,           "gemma"      },
-    { LLM_ARCH_STARCODER2,      "starcoder2" },
-    { LLM_ARCH_MAMBA,           "mamba"      },
-    { LLM_ARCH_XVERSE,          "xverse"     },
-    { LLM_ARCH_COMMAND_R,       "command-r"  },
-    { LLM_ARCH_DBRX,            "dbrx"       },
-    { LLM_ARCH_OLMO,            "olmo"       },
-    { LLM_ARCH_UNKNOWN,         "(unknown)"  },
+    { LLM_ARCH_LLAMA,           "llama"        },
+    { LLM_ARCH_FALCON,          "falcon"       },
+    { LLM_ARCH_GROK,            "grok"         },
+    { LLM_ARCH_GPT2,            "gpt2"         },
+    { LLM_ARCH_GPTJ,            "gptj"         },
+    { LLM_ARCH_GPTNEOX,         "gptneox"      },
+    { LLM_ARCH_MPT,             "mpt"          },
+    { LLM_ARCH_BAICHUAN,        "baichuan"     },
+    { LLM_ARCH_STARCODER,       "starcoder"    },
+    { LLM_ARCH_REFACT,          "refact"       },
+    { LLM_ARCH_BERT,            "bert"         },
+    { LLM_ARCH_NOMIC_BERT,      "nomic-bert"   },
+    { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2" },
+    { LLM_ARCH_BLOOM,           "bloom"        },
+    { LLM_ARCH_STABLELM,        "stablelm"     },
+    { LLM_ARCH_QWEN,            "qwen"         },
+    { LLM_ARCH_QWEN2,           "qwen2"        },
+    { LLM_ARCH_QWEN2MOE,        "qwen2moe"     },
+    { LLM_ARCH_PHI2,            "phi2"         },
+    { LLM_ARCH_PHI3,            "phi3"         },
+    { LLM_ARCH_PLAMO,           "plamo"        },
+    { LLM_ARCH_CODESHELL,       "codeshell"    },
+    { LLM_ARCH_ORION,           "orion"        },
+    { LLM_ARCH_INTERNLM2,       "internlm2"    },
+    { LLM_ARCH_MINICPM,         "minicpm"      },
+    { LLM_ARCH_GEMMA,           "gemma"        },
+    { LLM_ARCH_STARCODER2,      "starcoder2"   },
+    { LLM_ARCH_MAMBA,           "mamba"        },
+    { LLM_ARCH_XVERSE,          "xverse"       },
+    { LLM_ARCH_COMMAND_R,       "command-r"    },
+    { LLM_ARCH_DBRX,            "dbrx"         },
+    { LLM_ARCH_OLMO,            "olmo"         },
+    { LLM_ARCH_ARCTIC,          "arctic"       },
+    { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
 enum llm_kv {
@@ -303,6 +302,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
@@ -380,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"                     },
     { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type"                     },
     { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"                   },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,  "%s.rope.scaling.attn_factor"              },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length"  },
     { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"                },
 
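Note: each entry is a printf-style template whose %s is replaced by the architecture name, so for Phi-3 the new key resolves to phi3.rope.scaling.attn_factor. An illustration of the expansion (llama.cpp uses its own format helper; plain snprintf shown here):

    char key[128];
    snprintf(key, sizeof(key), "%s.rope.scaling.attn_factor", "phi3");
    // key now holds "phi3.rope.scaling.attn_factor"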
@@ -435,6 +436,8 @@ enum llm_tensor {
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ROPE_FACTORS_LONG,
+    LLM_TENSOR_ROPE_FACTORS_SHORT,
     LLM_TENSOR_ATTN_Q,
     LLM_TENSOR_ATTN_K,
     LLM_TENSOR_ATTN_V,
@@ -454,6 +457,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_NORM_EXPS,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
@@ -592,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_PERSIMMON,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
-            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-        },
-    },
     {
         LLM_ARCH_MPT,
         {
@@ -691,6 +678,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -800,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_PHI3,
         {
-            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
-            { LLM_TENSOR_OUTPUT,             "output" },
-            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
         },
     },
     {
@@ -1027,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ARCTIC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM_EXPS,   "blk.%d.ffn_norm_exps" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1664,91 +1694,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     GGML_UNUSED(host_buffer);
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1757,6 +1702,8 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }
 
@@ -1770,17 +1717,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_14M,
     MODEL_17M,
     MODEL_22M,
     MODEL_33M,
+    MODEL_70M,
     MODEL_109M,
     MODEL_137M,
+    MODEL_160M,
     MODEL_335M,
+    MODEL_410M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_4B,
     MODEL_2B,
+    MODEL_2_8B,
     MODEL_3B,
     MODEL_4B,
+    MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
     MODEL_12B,
@@ -1803,6 +1757,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_10B_128x3_66B,
 };
 
 static const size_t kiB = 1024;
@@ -1812,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
+    bool use_par_res;
 
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
@@ -1830,6 +1786,7 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
+    float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
@@ -1845,7 +1802,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool use_alibi   = false; // currently, we need KQ_pos data for ALiBi-based models
+    bool use_alibi   = false;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -1878,6 +1835,7 @@ struct llama_hparams {
 
         if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
         if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
@@ -1975,6 +1933,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm_b;
     struct ggml_tensor * layer_out_norm;
    struct ggml_tensor * layer_out_norm_b;
+    struct ggml_tensor * ffn_norm_exps;
 
     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -2012,6 +1971,10 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    // long rope factors
+    struct ggml_tensor * rope_long  = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
 };
 
 struct llama_kv_cell {
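Note: these members carry the Phi-3 LongRoPE frequency factors loaded from the new rope_factors_long/rope_factors_short tensors. A sketch of the per-layer selection they enable (the function name and exact condition are assumptions, not code from this diff):

    // Sketch: pick long-context or short-context rope factors for a layer.
    static struct ggml_tensor * rope_factors_for(const llama_layer & layer,
                                                 uint32_t n_ctx, uint32_t n_orig_ctx) {
        if (n_ctx > n_orig_ctx && layer.rope_long != nullptr) {
            return layer.rope_long;  // factors tuned for extended contexts
        }
        return layer.rope_short;     // may be nullptr for models without them
    }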
@@ -2189,6 +2152,8 @@ struct llama_model {
     int main_gpu;
     int n_gpu_layers;
 
+    std::vector<std::string> rpc_servers;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
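Note: each entry is one host:port RPC endpoint, indexed like a GPU id. A sketch of how a comma-separated endpoint string could be split into this vector (illustrative parsing, not necessarily the exact code in this release):

    #include <sstream>
    #include <string>
    #include <vector>

    static std::vector<std::string> split_endpoints(const std::string & csv) {
        std::vector<std::string> out;
        std::stringstream ss(csv);
        std::string endpoint;
        while (std::getline(ss, endpoint, ',')) { // e.g. "host1:50052,host2:50052"
            out.push_back(endpoint);
        }
        return out;
    }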
@@ -2317,7 +2282,6 @@ struct llama_context {
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos;  // F32 [n_kv]
     struct ggml_tensor * inp_K_shift; // I32 [kv_size]
     struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;     // I32 [n_batch]
@@ -2327,11 +2291,105 @@ struct llama_context {
 
     // control vectors
     struct llama_control_vector cvec;
+};
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+    buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(true);
+    }
+    return buft;
+    GGML_UNUSED(model);
+    GGML_UNUSED(gpu);
+}
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+    if (ggml_backend_cuda_get_device_count() > 1) {
+        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    }
 #endif
-};
+
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_offload(model, fallback_gpu);
+    }
+    return buft;
+
+    GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    return free;
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+    GGML_UNUSED(device);
+}
 
 //
 // kv cache helpers
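Note: the helpers are moved below llama_model and now take the model so that, under GGML_USE_RPC, "device i" means the i-th RPC endpoint and free memory is queried over the wire via ggml_backend_rpc_get_device_memory. The tensor-loading hunks further below use them to weight the layer split by free memory; a condensed restatement of that usage:

    std::vector<float> splits(llama_get_device_count(model));
    for (size_t i = 0; i < splits.size(); ++i) {
        splits[i] = llama_get_device_memory(model, (int) i); // free bytes per device
    }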
@@ -2452,7 +2510,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
-    const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
     if (cache.recurrent) {
@@ -2503,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
     }
     // otherwise, one cell per token.
 
-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
         return false;
     }
 
     uint32_t n_tested = 0;
 
     while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
             cache.head = 0;
             continue;
         }
@@ -2531,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
             break;
         }
 
-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
@@ -2785,6 +2842,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
     cache.do_defrag = true;
 }
 
+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
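Note: flash-attention kernels read the KV cache in fixed-size blocks, hence the larger granularity when flash_attn is on. A sketch of how such a helper is applied when sizing the cache (GGML_PAD is ggml's round-up-to-a-multiple macro; the assignment itself is illustrative):

    cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));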
@@ -3287,22 +3349,55 @@ struct llama_model_loader {
     }
 
     template<typename T>
-    bool get_key(const std::string & key, T & result, const bool required = true) {
-        auto it = kv_overrides.find(key);
+    bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+        const int kid = gguf_find_key(meta, key.c_str());
 
-        const struct llama_model_kv_override * override =
-            it != kv_overrides.end() ? &it->second : nullptr;
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
 
-        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
 
-        if (required && !found) {
-            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
         }
 
-        return found;
-    }
+        // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32   || std::is_same<T, int>::value));
 
-    template<typename T>
+        result.resize(arr_info.length);
+        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+        return true;
+    }
+
+    template<typename T>
+    bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+        return get_arr(llm_kv(kid), result, required);
+    }
+
+    template<typename T>
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
+
+        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+
+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
     bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
         return get_key(llm_kv(kid), result, required);
     }
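Note: get_arr is the array-valued counterpart of get_key, with the element type checked against the GGUF array type. An illustrative call (the key name here is hypothetical):

    std::vector<float> factors;
    const bool found = ml.get_arr("rope.scaling.factors", factors, /*required =*/ false);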
@@ -3360,11 +3455,15 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         ggml_set_name(tensor, ggml_get_name(cur));
 
-        n_created++;
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }
 
         return tensor;
     }
@@ -3399,14 +3498,17 @@ struct llama_model_loader {
         return cur;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
-        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+    static const int TENSOR_NOT_REQUIRED = 1;
+    static const int TENSOR_DUPLICATED   = 2;
+
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
         if (cur == NULL) {
             return NULL;
         }
 
-        return create_tensor_for(ctx, cur);
+        return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
 
     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
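Note: replacing the single required bool with bit flags lets one call express both "this tensor may be absent" and "this tensor aliases another, count only its bytes". The pattern the rest of this diff converts to:

    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"),
                                    {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
    if (model.output == NULL) {
        // tied embeddings: reuse tok_embd, counted as duplicated data
        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),
                                        {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
    }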
@@ -3706,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_17M:    return "17M";
-        case MODEL_22M:    return "22M";
-        case MODEL_33M:    return "33M";
-        case MODEL_109M:   return "109M";
-        case MODEL_137M:   return "137M";
-        case MODEL_335M:   return "335M";
-        case MODEL_0_5B:   return "0.5B";
-        case MODEL_1B:     return "1B";
-        case MODEL_2B:     return "2B";
-        case MODEL_3B:     return "3B";
-        case MODEL_4B:     return "4B";
-        case MODEL_7B:     return "7B";
-        case MODEL_8B:     return "8B";
-        case MODEL_12B:    return "12B";
-        case MODEL_13B:    return "13B";
-        case MODEL_14B:    return "14B";
-        case MODEL_15B:    return "15B";
-        case MODEL_20B:    return "20B";
-        case MODEL_30B:    return "30B";
-        case MODEL_34B:    return "34B";
-        case MODEL_35B:    return "35B";
-        case MODEL_40B:    return "40B";
-        case MODEL_65B:    return "65B";
-        case MODEL_70B:    return "70B";
-        case MODEL_314B:   return "314B";
-        case MODEL_SMALL:  return "0.1B";
-        case MODEL_MEDIUM: return "0.4B";
-        case MODEL_LARGE:  return "0.8B";
-        case MODEL_XL:     return "1.5B";
-        case MODEL_A2_7B:  return "A2.7B";
-        case MODEL_8x7B:   return "8x7B";
-        case MODEL_8x22B:  return "8x22B";
-        case MODEL_16x12B: return "16x12B";
-        default:           return "?B";
+        case MODEL_14M:           return "14M";
+        case MODEL_17M:           return "17M";
+        case MODEL_22M:           return "22M";
+        case MODEL_33M:           return "33M";
+        case MODEL_70M:           return "70M";
+        case MODEL_109M:          return "109M";
+        case MODEL_137M:          return "137M";
+        case MODEL_160M:          return "160M";
+        case MODEL_335M:          return "335M";
+        case MODEL_410M:          return "410M";
+        case MODEL_0_5B:          return "0.5B";
+        case MODEL_1B:            return "1B";
+        case MODEL_1_4B:          return "1.4B";
+        case MODEL_2B:            return "2B";
+        case MODEL_2_8B:          return "2.8B";
+        case MODEL_3B:            return "3B";
+        case MODEL_4B:            return "4B";
+        case MODEL_6_9B:          return "6.9B";
+        case MODEL_7B:            return "7B";
+        case MODEL_8B:            return "8B";
+        case MODEL_12B:           return "12B";
+        case MODEL_13B:           return "13B";
+        case MODEL_14B:           return "14B";
+        case MODEL_15B:           return "15B";
+        case MODEL_20B:           return "20B";
+        case MODEL_30B:           return "30B";
+        case MODEL_34B:           return "34B";
+        case MODEL_35B:           return "35B";
+        case MODEL_40B:           return "40B";
+        case MODEL_65B:           return "65B";
+        case MODEL_70B:           return "70B";
+        case MODEL_314B:          return "314B";
+        case MODEL_SMALL:         return "0.1B";
+        case MODEL_MEDIUM:        return "0.4B";
+        case MODEL_LARGE:         return "0.8B";
+        case MODEL_XL:            return "1.5B";
+        case MODEL_A2_7B:         return "A2.7B";
+        case MODEL_8x7B:          return "8x7B";
+        case MODEL_8x22B:         return "8x22B";
+        case MODEL_16x12B:        return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        default:                  return "?B";
     }
 }
 
@@ -3779,6 +3892,12 @@ static void llm_load_hparams(
 
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH,      hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH,    hparams.n_embd);
     ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
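Note: with the early return, a vocab-only load no longer reads or validates any non-vocabulary hyperparameters. Hedged example of the public code path this serves:

    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true; // tokenizer only, no weights
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);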
@@ -3823,6 +3942,8 @@ static void llm_load_hparams(
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
     // sanity check for n_rot (optional)
     {
         hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3860,7 +3981,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.n_vocab == 32000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -3922,14 +4043,6 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 36: model.type = e_model::MODEL_8B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_REFACT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3962,6 +4075,19 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4:  model.type = e_model::MODEL_33M;  break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4058,6 +4184,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4198,6 +4325,65 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff) {
+                            case 512: model.type = e_model::MODEL_14M; break;
+                            case 2048: model.type = e_model::MODEL_70M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff) {
+                            case 3072: model.type = e_model::MODEL_160M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff) {
+                            case 8192: model.type = e_model::MODEL_1B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff) {
+                            case 4096: model.type = e_model::MODEL_410M; break;
+                            case 8192: model.type = e_model::MODEL_1_4B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff) {
+                            case 10240: model.type = e_model::MODEL_2_8B; break;
+                            case 16384: model.type = e_model::MODEL_6_9B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff) {
+                            case 20480: model.type = e_model::MODEL_12B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff) {
+                            case 24576: model.type = e_model::MODEL_20B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
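Note: the GPT-NeoX branch identifies Pythia-family checkpoints purely from the (n_layer, n_ff) pair; Pythia uses n_ff = 4*n_embd, so each pair is unambiguous. Worked examples (the n_embd values are assumptions taken from the published Pythia configurations):

    // Pythia-410M: n_layer = 24, n_embd = 1024 -> n_ff = 4*1024 = 4096  -> MODEL_410M
    // Pythia-6.9B: n_layer = 32, n_embd = 4096 -> n_ff = 4*4096 = 16384 -> MODEL_6_9B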
@@ -4383,7 +4569,11 @@ static void llm_load_vocab(
                 tokenizer_pre == "starcoder") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
         } else if (
-                tokenizer_pre == "gpt-2") {
+                tokenizer_pre == "gpt-2"      ||
+                tokenizer_pre == "jina-es"    ||
+                tokenizer_pre == "jina-de"    ||
+                tokenizer_pre == "jina-v2-es" ||
+                tokenizer_pre == "jina-v2-de") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -4394,6 +4584,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "qwen2") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+                tokenizer_pre == "stablelm2") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
         } else if (
                 tokenizer_pre == "olmo") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -4515,7 +4708,8 @@ static void llm_load_vocab(
                     (t.first == "<|eot_id|>" ||
                      t.first == "<|im_end|>" ||
                      t.first == "<|end|>" ||
-                     t.first == "<end_of_turn>"
+                     t.first == "<end_of_turn>" ||
+                     t.first == "<|endoftext|>"
                     )
                ) {
                 vocab.special_eot_id = t.second;
@@ -4743,13 +4937,13 @@ static bool llm_load_tensors(
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
-        int device_count = llama_get_device_count();
+        int device_count = llama_get_device_count(model);
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
+                splits[i] = llama_get_device_memory(model, i);
             }
         } else {
             std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4963,35 @@ static bool llm_load_tensors(
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, main_gpu);
         }
         // assign the repeating layers
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
            model.buft_layer[i] = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -4841,6 +5035,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd       = hparams.n_embd;
+        const int64_t n_embd_head  = n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa   = n_embd_v_gqa;
@@ -4875,12 +5070,10 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         // if output is NULL, init from the input tok embed
                         if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                         }
                     }
                 }
@@ -4899,10 +5092,10 @@ static bool llm_load_tensors(
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
                     // optional bias tensors
-                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     false);
-                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, false);
-                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, false);
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     false);
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
@@ -4913,7 +5106,7 @@ static bool llm_load_tensors(
                     } else {
                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         if (layer.ffn_gate_exps) {
                             layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
                             layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
@@ -4955,12 +5148,10 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
 
@@ -4983,7 +5174,7 @@ static bool llm_load_tensors(
 
                     layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (layer.ffn_gate_exps) {
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
                         layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
@@ -5085,11 +5276,9 @@ static bool llm_load_tensors(
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
 
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (!model.output) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                     }
                 }
 
@@ -5102,8 +5291,8 @@ static bool llm_load_tensors(
                     layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
 
-                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
-                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, false);
+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                     layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5121,7 +5310,12 @@ static bool llm_load_tensors(
                 {
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (!model.output) {
+                        // needs to be on GPU
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
@@ -5149,47 +5343,6 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
-            case LLM_ARCH_PERSIMMON:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
-
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
-                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),   {64});
-
-                        layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
-                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),   {64});
-                    }
-                } break;
             case LLM_ARCH_BERT:
             case LLM_ARCH_NOMIC_BERT:
                 {
@@ -5242,6 +5395,50 @@ static bool llm_load_tensors(
|
|
5242
5395
|
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
5243
5396
|
}
|
5244
5397
|
} break;
|
5398
|
+
case LLM_ARCH_JINA_BERT_V2:
|
5399
|
+
{
|
5400
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
|
5401
|
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
|
5402
|
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
|
5403
|
+
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
|
5404
|
+
|
5405
|
+
for (int i = 0; i < n_layer; ++i) {
|
5406
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5407
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5408
|
+
|
5409
|
+
auto & layer = model.layers[i]; // JinaBertLayer
|
5410
|
+
|
5411
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5412
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
5413
|
+
|
5414
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5415
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5416
|
+
|
5417
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5418
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
5419
|
+
|
5420
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5421
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5422
|
+
|
5423
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5424
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
5425
|
+
|
5426
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
|
5427
|
+
layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
|
5428
|
+
|
5429
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
|
5430
|
+
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
5431
|
+
|
5432
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5433
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5434
|
+
|
5435
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
5436
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
5437
|
+
|
5438
|
+
layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
5439
|
+
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
5440
|
+
}
|
5441
|
+
} break;
|
5245
5442
|
case LLM_ARCH_BLOOM:
|
5246
5443
|
{
|
5247
5444
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
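Note on the new JINA_BERT_V2 branch above: it mirrors the BERT loader but adds token-type embeddings, makes the per-layer Q/K LayerNorms optional via TENSOR_NOT_REQUIRED, and loads a gate projection (ffn_gate) next to ffn_up for its gated FFN. There are no positional-embedding or RoPE tensors to load: as the graph-build changes later in this diff show, this architecture skips inp_pos and relies on ALiBi bias inside the attention softmax.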
@@ -5283,18 +5480,16 @@ static bool llm_load_tensors(
         case LLM_ARCH_MPT:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
+                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

                 // output
                 {
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (!model.output) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                     }
                 }

@@ -5305,31 +5500,31 @@ static bool llm_load_tensors(
                     auto & layer = model.layers[i];

                     layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     // AWQ ScaleActivation layer
-                    layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
+                    layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 }
             } break;
         case LLM_ARCH_STABLELM:
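The recurring change in these hunks is the loader API: the old boolean `required` argument of create_tensor becomes a flags word, and the manual `ml.n_created--` / `ml.size_data` bookkeeping for tensors that alias already-loaded data is folded into TENSOR_DUPLICATED. A minimal sketch of those semantics, using a simplified stand-in rather than the real llama_model_loader:

    #include <map>
    #include <stdexcept>
    #include <string>

    // Simplified stand-in for the flag-based create_tensor(); not the real
    // llama_model_loader, just the accounting semantics of the two flags.
    struct loader_sketch {
        enum tensor_flags { TENSOR_NOT_REQUIRED = 1 << 0, TENSOR_DUPLICATED = 1 << 1 };

        std::map<std::string, size_t> weights; // tensor name -> size in bytes
        int    n_created = 0;                  // tensors the load loop must fill
        size_t size_data = 0;                  // bytes the load loop must read

        const size_t * create_tensor(const std::string & name, int flags = 0) {
            auto it = weights.find(name);
            if (it == weights.end()) {
                if (flags & TENSOR_NOT_REQUIRED) {
                    return nullptr; // optional tensor: caller checks for NULL
                }
                throw std::runtime_error("missing required tensor: " + name);
            }
            if (!(flags & TENSOR_DUPLICATED)) {
                // a duplicated tensor is a second view over data that is already
                // counted, so it adds nothing to load-time accounting -- this is
                // what the removed "ml.n_created--" fixups did by hand
                n_created += 1;
                size_data += it->second;
            }
            return &it->second;
        }
    };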
@@ -5358,17 +5553,17 @@ static bool llm_load_tensors(
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                     // optional bias tensors, present in Stable LM 2 1.6B
-                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
-                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
-                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     // optional q and k layernorms, present in StableLM 2 12B
-                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
-                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
-                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});

@@ -5411,12 +5606,10 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }

@@ -5514,8 +5707,8 @@ static bool llm_load_tensors(
                     layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     if (layer.wqkv == nullptr) {
                         layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});

@@ -5552,17 +5745,20 @@ static bool llm_load_tensors(
                     ggml_context* ctx_layer = ctx_for_layer(i);
                     ggml_context* ctx_split = ctx_for_layer_split(i);

-                    auto& layer = model.layers[i];
+                    auto & layer = model.layers[i];

                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });

-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
-                    layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });

                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });

                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+
+                    layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                 }
             } break;
         case LLM_ARCH_PLAMO:
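The rope_long/rope_short factor tensors added for Phi-3 above are loaded once and then flagged TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED for every layer after the first, since all layers share one pair of factor vectors of length n_embd_head/2; the choice between the two happens at graph-build time in build_rope_factors, which appears later in this diff.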
@@ -5731,9 +5927,7 @@ static bool llm_load_tensors(

                 // output
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                 const int64_t n_ff          = hparams.n_ff;
                 const int64_t n_embd_head_k = hparams.n_embd_head_k;

@@ -5768,12 +5962,10 @@ static bool llm_load_tensors(
                 model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                 }

             }

@@ -5824,12 +6016,10 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed, duplicated to allow offloading
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }

@@ -5890,9 +6080,7 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     // init output from the input tok embed
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                 }

                 for (int i = 0; i < n_layer; ++i) {

@@ -5924,12 +6112,10 @@ static bool llm_load_tensors(

                 // output
                 {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
@@ -5949,6 +6135,81 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                // output
+                {
+                    model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                    layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                    layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert});
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
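The new ARCTIC branch is worth a second look: unlike the other MoE loaders, its dense FFN projections are square ({n_embd, n_embd}) and coexist with a routed expert branch consisting of a router (ffn_gate_inp), a dedicated pre-expert norm (ffn_norm_exps), and fused per-expert gate/down/up tensors carrying an n_expert dimension, matching Arctic's dense-plus-residual-MoE layout.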
@@ -6213,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(

         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }

@@ -6318,7 +6576,7 @@ static struct ggml_tensor * llm_build_ffn(
          llm_ffn_gate_type   type_gate,
         const llm_build_cb & cb,
                         int   il) {
-    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+    struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
    cb(tmp, "ffn_up", il);

    if (up_b) {

@@ -6500,7 +6758,6 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
-        struct ggml_tensor * kq_pos,
                     int32_t   n_tokens,
                     int32_t   n_kv,
                     float     kq_scale,

@@ -6512,6 +6769,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();

     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
@@ -6530,31 +6788,27 @@ static struct ggml_tensor * llm_build_kqv(
         GGML_UNUSED(model);
         GGML_UNUSED(n_ctx);

-        // note: if this assert triggers, then some check has failed earlier
-        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
-        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
         // split cached v into n_head heads (not transposed)
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                     0);
         cb(v, "v", il);

-        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);

-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }

-        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

@@ -6574,28 +6828,8 @@ static struct ggml_tensor * llm_build_kqv(
             kq = ggml_scale(ctx, kq, 30);
         }

-#if defined(GGML_USE_KOMPUTE)
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
-#pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
-        if (hparams.use_alibi) {
-            kq = ggml_scale(ctx, kq, kq_scale);
-            cb(kq, "kq_scaled", il);
-
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-            cb(kq, "kq_scaled_alibi", il);
-
-            kq = ggml_add(ctx, kq, kq_mask);
-            cb(kq, "kq_masked", il);
-
-            kq = ggml_soft_max(ctx, kq);
-            cb(kq, "kq_soft_max", il);
-        } else
-#endif
-        {
-            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
-            cb(kq, "kq_soft_max_ext", il);
-        }
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        cb(kq, "kq_soft_max_ext", il);

         GGML_ASSERT(kv.size == n_ctx);

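With the kq_pos tensor gone, ALiBi is expressed entirely by the max_bias scalar that ggml_soft_max_ext (and now ggml_flash_attn_ext) receives, with per-head slopes derived inside the kernel. For reference, a sketch of the conventional slope schedule such a max_bias parameter encodes; this is an illustration of the standard ALiBi formulation, not a copy of ggml's kernel code:

    #include <cmath>
    #include <vector>

    // Per-head ALiBi slopes for a given max_bias (standard formulation:
    // slopes are negative powers of two, split into two geometric series
    // when n_head is not itself a power of two).
    std::vector<float> alibi_slopes(int n_head, float max_bias) {
        const int   n_head_log2 = 1 << (int) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -max_bias          / n_head_log2);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);

        std::vector<float> slopes(n_head);
        for (int h = 0; h < n_head; ++h) {
            slopes[h] = (h < n_head_log2)
                ? std::pow(m0, (float) (h + 1))                      // first series
                : std::pow(m1, (float) (2 * (h - n_head_log2) + 1)); // interleaved series
        }
        return slopes;
    }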
@@ -6614,7 +6848,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
         cb(kqv_merged, "kqv_merged", il);

-        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
         cb(cur, "kqv_merged_cont", il);
     }

@@ -6645,7 +6879,6 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
-        struct ggml_tensor * kq_pos,
                     int32_t   n_tokens,
                     int32_t   kv_head,
                     int32_t   n_kv,

@@ -6664,7 +6897,7 @@ static struct ggml_tensor * llm_build_kv(
     struct ggml_tensor * cur;

     cur  = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
+            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);

     return cur;

@@ -6771,18 +7004,17 @@ struct llm_build_context {

         ctx0 = ggml_init(params);

-        lctx.inp_tokens  = nullptr;
-        lctx.inp_embd    = nullptr;
-        lctx.inp_pos     = nullptr;
+        lctx.inp_tokens  = nullptr;
+        lctx.inp_embd    = nullptr;
+        lctx.inp_pos     = nullptr;
         lctx.inp_out_ids = nullptr;
         lctx.inp_KQ_mask = nullptr;
-        lctx.inp_KQ_pos  = nullptr;
         lctx.inp_K_shift = nullptr;
-        lctx.inp_mean    = nullptr;
-        lctx.inp_cls     = nullptr;
-        lctx.inp_s_copy  = nullptr;
-        lctx.inp_s_mask  = nullptr;
-        lctx.inp_s_seq   = nullptr;
+        lctx.inp_mean    = nullptr;
+        lctx.inp_cls     = nullptr;
+        lctx.inp_s_copy  = nullptr;
+        lctx.inp_s_mask  = nullptr;
+        lctx.inp_s_seq   = nullptr;
     }

@@ -6801,17 +7033,20 @@ struct llm_build_context {
        cb(lctx.inp_K_shift, "K_shift", -1);
        ggml_set_input(lctx.inp_K_shift);

+
        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
            struct ggml_tensor * tmp =
                // we rotate only the first n_rot dimensions
-                ggml_rope_custom_inplace(ctx0,
+                ggml_rope_ext_inplace(ctx0,
                        ggml_view_3d(ctx0, kv_self.k_l[il],
                            n_embd_head_k, n_head_kv, n_ctx,
                            ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
                            ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                            0),
-                        lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                        lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
+
            cb(tmp, "K_shifted", il);
            ggml_build_forward_expand(gf, tmp);
        }
@@ -6914,6 +7149,17 @@ struct llm_build_context {
        return lctx.inp_pos;
    }

+    struct ggml_tensor * build_rope_factors(int il) {
+        // choose long/short freq factors based on the context size
+        const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+            return model.layers[il].rope_long;
+        }
+
+        return model.layers[il].rope_short;
+    }
+
    struct ggml_tensor * build_inp_out_ids() {
        lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
        cb(lctx.inp_out_ids, "inp_out_ids", -1);
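The rule above is purely a function of the effective per-sequence context. A standalone restatement with the cparams/hparams fields flattened into a hypothetical struct:

    #include <cstdint>

    // Hypothetical flattened view of the fields build_rope_factors() consults.
    struct rope_ctx_params {
        uint32_t n_ctx;           // cparams.n_ctx: total context allocated
        uint32_t n_seq_max;       // cparams.n_seq_max: parallel sequences sharing it
        uint32_t n_yarn_orig_ctx; // hparams.n_yarn_orig_ctx: training context size
    };

    // true -> use the "long" frequency factors (e.g. Phi-3's LongRoPE long set)
    bool use_long_rope_factors(const rope_ctx_params & p) {
        const uint32_t n_ctx_per_seq = p.n_ctx / p.n_seq_max;
        // only once a single sequence can exceed the training window do the
        // long-context scaling factors kick in
        return n_ctx_per_seq > p.n_yarn_orig_ctx;
    }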
@@ -6932,19 +7178,6 @@ struct llm_build_context {
        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
    }

-    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
-        if (causal) {
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
-        } else {
-            // TODO: this will be needed for ALiBi-based BERT models
-            // https://github.com/ggerganov/llama.cpp/pull/6826
-            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
-        }
-        cb(lctx.inp_KQ_pos, "KQ_pos", -1);
-        ggml_set_input(lctx.inp_KQ_pos);
-        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
-    }
-
    struct ggml_tensor * build_inp_mean() {
        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
        cb(lctx.inp_mean, "inp_mean", -1);

@@ -7034,15 +7267,15 @@ struct llm_build_context {
                    cb(Vcur, "Vcur", il);
                }

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
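From here on, every former ggml_rope_custom call site becomes ggml_rope_ext, whose fourth argument is the new optional frequency-factors tensor: the per-architecture builders all pass nullptr, while the K-shift path earlier in this diff passes the tensor returned by build_rope_factors(il).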
@@ -7050,7 +7283,7 @@ struct llm_build_context {

                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }

            if (il == n_layer - 1) {

@@ -7143,9 +7376,6 @@ struct llm_build_context {
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;

@@ -7167,13 +7397,13 @@ struct llm_build_context {

                switch (model.type) {
                    case MODEL_7B:
-                        Qcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                        Qcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                            ext_factor, attn_factor, beta_fast, beta_slow
                        );
-                        Kcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                        Kcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                            ext_factor, attn_factor, beta_fast, beta_slow
                        );

@@ -7190,7 +7420,7 @@ struct llm_build_context {

                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }

            if (il == n_layer - 1) {

@@ -7260,9 +7490,6 @@ struct llm_build_context {
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;

@@ -7282,22 +7509,22 @@ struct llm_build_context {
                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Kcur, "Kcur", il);
                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }

            if (il == n_layer - 1) {

@@ -7403,21 +7630,21 @@ struct llm_build_context {
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

            // using mode = 2 for neox mode
-            Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);

-            Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Kcur, "Kcur", il);

            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }

        if (il == n_layer - 1) {

@@ -7526,15 +7753,15 @@ struct llm_build_context {
                cb(Vcur, "Vcur", il);
            }

-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);

-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );

@@ -7542,7 +7769,7 @@ struct llm_build_context {

            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
        }

        if (il == n_layer - 1) {

@@ -7678,15 +7905,15 @@ struct llm_build_context {
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);

-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );

@@ -7694,7 +7921,7 @@ struct llm_build_context {

            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }

        if (il == n_layer - 1) {

@@ -7806,7 +8033,7 @@ struct llm_build_context {

                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }

            if (il == n_layer - 1) {
@@ -7855,266 +8082,56 @@ struct llm_build_context {
        return gf;
    }

-    struct ggml_cgraph * build_persimmon() {
+    struct ggml_cgraph * build_refact() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head   == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;

        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
-
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * residual = inpL;
+            struct ggml_tensor * inpSA = inpL;

            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
            cb(cur, "attn_norm", il);

-            // self attention
+            // self-attention
            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                // split qkv
-                GGML_ASSERT(n_head_kv == n_head);
-
-                struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
-                cb(tmpqkv, "tmpqkv", il);
-
-                struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
-                cb(tmpqkv_perm, "tmpqkv", il);
-
-                struct ggml_tensor * tmpq = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        0
-                );
-                cb(tmpq, "tmpq", il);
-
-                struct ggml_tensor * tmpk = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
-                );
-                cb(tmpk, "tmpk", il);
-
-                // Q/K Layernorm
-                tmpq = llm_build_norm(ctx0, tmpq, hparams,
-                        model.layers[il].attn_q_norm,
-                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpq, "tmpq", il);
-
-                tmpk = llm_build_norm(ctx0, tmpk, hparams,
-                        model.layers[il].attn_k_norm,
-                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpk, "tmpk", il);
-
-                // RoPE the first n_rot of q/k, pass the other half, and concat.
-                struct ggml_tensor * qrot = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        0
-                );
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        0
-                );
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        ggml_element_size(tmpq) * n_rot
-                );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        ggml_element_size(tmpk) * n_rot
-                );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                        ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                        ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-                cb(Q, "Q", il);
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
-                );
-                cb(Vcur, "Vcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur      = ggml_get_rows(ctx0, cur, inp_out_ids);
-                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
-                        NULL, NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                cb(Kcur, "Kcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                cb(Qcur, "Qcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }

            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);
@@ -8168,8 +8185,11 @@ struct llm_build_context {

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
+        struct ggml_tensor * inp_pos = nullptr;

-        struct ggml_tensor * inp_pos  = build_inp_pos();
+        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+            inp_pos = build_inp_pos();
+        }
        struct ggml_tensor * inp_mean = build_inp_mean();
        struct ggml_tensor * inp_cls  = build_inp_cls();

@@ -8200,13 +8220,26 @@ struct llm_build_context {
            struct ggml_tensor * Vcur;

            // self-attention
-            if (model.arch == LLM_ARCH_BERT) {
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
                Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                cb(Qcur, "Qcur", il);

+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                }
+
                Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                cb(Kcur, "Kcur", il);

+                if (model.layers[il].attn_k_norm) {
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                }
                Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                cb(Vcur, "Vcur", il);

@@ -8225,15 +8258,15 @@ struct llm_build_context {
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );

@@ -8246,7 +8279,7 @@ struct llm_build_context {
            struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
            cb(kq, "kq", il);

-            kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+            kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
            cb(kq, "kq_soft_max_ext", il);

            struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8297,6 +8330,13 @@ struct llm_build_context {
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                        NULL,
                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
            } else {
                cur = llm_build_ffn(ctx0, cur,
                        model.layers[il].ffn_up, NULL,
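For JINA_BERT_V2 the FFN switches from BERT's sequential GELU to a parallel gated form (LLM_FFN_GELU with LLM_FFN_PAR). A minimal scalar sketch of that shape as I read the parallel-gate path, with the down projection elided; treat the exact activation placement as an assumption:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // tanh-approximation GELU, as commonly used by ggml
    static float gelu_tanh(float x) {
        return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
    }

    // h = gelu(W_gate x) * (W_up x); the W_down projection would follow.
    // `up` and `gate` hold the two projections of the same input row.
    std::vector<float> geglu_hidden(const std::vector<float> & up,
                                    const std::vector<float> & gate) {
        std::vector<float> h(up.size());
        for (std::size_t i = 0; i < up.size(); ++i) {
            h[i] = gelu_tanh(gate[i]) * up[i];
        }
        return h;
    }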
@@ -8363,9 +8403,6 @@ struct llm_build_context {
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
        inpL = llm_build_norm(ctx0, inpL, hparams,
                model.tok_norm,
                model.tok_norm_b,

@@ -8399,7 +8436,7 @@ struct llm_build_context {

                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }

            if (il == n_layer - 1) {

@@ -8464,9 +8501,6 @@ struct llm_build_context {
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
        if (model.pos_embd) {
            // inp_pos - contains the positions
            struct ggml_tensor * inp_pos = build_inp_pos();

@@ -8530,13 +8564,13 @@ struct llm_build_context {

                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            } else {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }
        }

@@ -8664,15 +8698,15 @@ struct llm_build_context {
            }

-            Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr,
                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);

-            Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr,
                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
            );

@@ -8680,7 +8714,7 @@ struct llm_build_context {

            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }

            if (il == n_layer - 1) {

@@ -8784,21 +8818,21 @@ struct llm_build_context {
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

            // using mode = 2 for neox mode
-            Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);

-            Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Kcur, "Kcur", il);

            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }

            if (il == n_layer - 1) {
@@ -8895,15 +8929,15 @@ struct llm_build_context {
|
|
8895
8929
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8896
8930
|
cb(Vcur, "Vcur", il);
|
8897
8931
|
|
8898
|
-
Qcur =
|
8899
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8932
|
+
Qcur = ggml_rope_ext(
|
8933
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8900
8934
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8901
8935
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8902
8936
|
);
|
8903
8937
|
cb(Qcur, "Qcur", il);
|
8904
8938
|
|
8905
|
-
Kcur =
|
8906
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8939
|
+
Kcur = ggml_rope_ext(
|
8940
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8907
8941
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8908
8942
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8909
8943
|
);
|
@@ -8911,7 +8945,7 @@ struct llm_build_context {
|
|
8911
8945
|
|
8912
8946
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8913
8947
|
model.layers[il].wo, model.layers[il].bo,
|
8914
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
8948
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8915
8949
|
}
|
8916
8950
|
|
8917
8951
|
if (il == n_layer - 1) {
|
@@ -9009,15 +9043,15 @@ struct llm_build_context {
|
|
9009
9043
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
9010
9044
|
cb(Vcur, "Vcur", il);
|
9011
9045
|
|
9012
|
-
Qcur =
|
9013
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9046
|
+
Qcur = ggml_rope_ext(
|
9047
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9014
9048
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9015
9049
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9016
9050
|
);
|
9017
9051
|
cb(Qcur, "Qcur", il);
|
9018
9052
|
|
9019
|
-
Kcur =
|
9020
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9053
|
+
Kcur = ggml_rope_ext(
|
9054
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9021
9055
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9022
9056
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9023
9057
|
);
|
@@ -9025,7 +9059,7 @@ struct llm_build_context {
|
|
9025
9059
|
|
9026
9060
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9027
9061
|
model.layers[il].wo, model.layers[il].bo,
|
9028
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9062
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9029
9063
|
}
|
9030
9064
|
|
9031
9065
|
if (il == n_layer - 1) {
|
@@ -9161,8 +9195,8 @@ struct llm_build_context {
|
|
9161
9195
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9162
9196
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9163
9197
|
|
9164
|
-
Qcur =
|
9165
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9198
|
+
Qcur = ggml_rope_ext(
|
9199
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9166
9200
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9167
9201
|
);
|
9168
9202
|
cb(Qcur, "Qcur", il);
|
@@ -9172,15 +9206,15 @@ struct llm_build_context {
|
|
9172
9206
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
9173
9207
|
cb(Qcur, "Qcur", il);
|
9174
9208
|
|
9175
|
-
Kcur =
|
9176
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9209
|
+
Kcur = ggml_rope_ext(
|
9210
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9177
9211
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9178
9212
|
);
|
9179
9213
|
cb(Kcur, "Kcur", il);
|
9180
9214
|
|
9181
9215
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9182
9216
|
model.layers[il].wo, model.layers[il].bo,
|
9183
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
9217
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9184
9218
|
}
|
9185
9219
|
|
9186
9220
|
if (il == n_layer - 1) {
|
@@ -9249,6 +9283,9 @@ struct llm_build_context {

             // self-attention
             {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
                     NULL,
@@ -9280,8 +9317,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9289,15 +9326,15 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
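The Phi-3 hunk above wires a `rope_factors` tensor into both RoPE calls. In 128k-context checkpoints of this style, two factor sets ship with the model and the loader selects one depending on whether the configured context exceeds the original training context. The sketch below illustrates that selection step only; the struct and function names are illustrative assumptions, not APIs from this diff:

    #include <cstdint>
    #include <vector>

    // Hypothetical holder for the two factor sets shipped with
    // Phi-3-128k-style models.
    struct rope_factor_sets {
        std::vector<float> short_factors;  // at or below the original context
        std::vector<float> long_factors;   // when the context is extended
    };

    // The long factors only apply once the configured context exceeds the
    // context the model was originally trained with.
    static const std::vector<float> & pick_rope_factors(
            const rope_factor_sets & f, uint32_t n_ctx, uint32_t n_ctx_orig) {
        return n_ctx > n_ctx_orig ? f.long_factors : f.short_factors;
    }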
@@ -9396,21 +9433,21 @@ struct llm_build_context {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
                     n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
             struct ggml_tensor * sa_out = cur;

@@ -9513,7 +9550,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
         }

         if (il == n_layer - 1) {
@@ -9604,15 +9641,15 @@ struct llm_build_context {
                 cb(tmpk, "tmpk", il);
                 cb(Vcur, "Vcur", il);

-                struct ggml_tensor * Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+                struct ggml_tensor * Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

-                struct ggml_tensor * Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                struct ggml_tensor * Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9620,7 +9657,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -9720,15 +9757,15 @@ struct llm_build_context {
                 //     cb(Vcur, "Vcur", il);
                 // }

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9736,7 +9773,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
         }

         if (il == n_layer - 1) {
@@ -9837,15 +9874,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9853,7 +9890,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -9967,15 +10004,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9983,7 +10020,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -10087,8 +10124,8 @@ struct llm_build_context {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
                     n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
@@ -10096,15 +10133,15 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
                 cb(Qcur, "Qcur_scaled", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -10207,15 +10244,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -10223,7 +10260,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -10490,22 +10527,267 @@ struct llm_build_context {
                             LLM_NORM, cb, il);
                     cb(Qcur, "Qcur", il);

-                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
-                            model.layers[il].attn_k_norm,
-                            NULL,
-                            LLM_NORM, cb, il);
-                    cb(Kcur, "Kcur", il);
-                }
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
+                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            }
+
+            struct ggml_tensor * attn_out = cur;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, ffn_inp,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // add together residual + FFN + self-attention
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, attn_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        if (f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+
+    }
+
+    // ref: https://allenai.org/olmo
+    // based on the original build_llama() function, changes:
+    //   * non-parametric layer norm
+    //   * clamp qkv
+    //   * removed bias
+    //   * removed MoE
+    struct ggml_cgraph * build_olmo() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    NULL, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    NULL, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                NULL, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_gptneox() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -10513,68 +10795,84 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
-                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
-                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }

-            struct ggml_tensor * attn_out = cur;
+            // ffn
+            if (hparams.use_par_res) {
+                // attention and ffn are computed in parallel
+                // x = x + attn(ln1(x)) + ffn(ln2(x))

-            // feed-forward network
-            {
-                cur = llm_build_ffn(ctx0, ffn_inp,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                struct ggml_tensor * attn_out = cur;
+
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
-            }

-            // add together residual + FFN + self-attention
-            cur = ggml_add(ctx0, cur, inpL);
-            cur = ggml_add(ctx0, cur, attn_out);
-            cb(cur, "l_out", il);
+                cur = ggml_add(ctx0, cur, inpL);
+                cb(cur, "ffn_out", il);

-            // input for next layer
-            inpL = cur;
-        }
+                inpL = ggml_add(ctx0, cur, attn_out);
+                cb(inpL, "l_out", il);
+            } else {
+                // attention and ffn are computed sequentially
+                // x = x + attn(ln1(x))
+                // x = x + ffn(ln2(x))

-        cur = inpL;
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+                cb(ffn_inp, "ffn_inp", il);

-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);

-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);

-        if (f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, f_logit_scale);
+                inpL = ggml_add(ctx0, cur, ffn_inp);
+                cb(inpL, "l_out", il);
+            }
         }

+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);

         return gf;
-
     }

-    // ref: https://allenai.org/olmo
-    // based on the original build_llama() function, changes:
-    //   * non-parametric layer norm
-    //   * clamp qkv
-    //   * removed bias
-    //   * removed MoE
-    struct ggml_cgraph * build_olmo() {
+    struct ggml_cgraph * build_arctic() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         // mutable variable, needed during the last layer of the computation to skip unused tokens
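The new `build_gptneox` above branches on `hparams.use_par_res`: parallel-residual checkpoints compute `x + attn(ln1(x)) + ffn(ln2(x))` with both sub-blocks reading the same layer input, while sequential checkpoints feed the attention output into the FFN. A toy sketch of the two dataflows, with identity stand-ins for the real sub-blocks (all names here are illustrative):

    #include <vector>

    using vec = std::vector<float>;

    static vec add(const vec & a, const vec & b) {
        vec r(a.size());
        for (size_t i = 0; i < a.size(); ++i) r[i] = a[i] + b[i];
        return r;
    }

    // Stand-ins for the real attention/FFN/norm blocks; any functions of
    // the hidden state would do for illustrating the wiring.
    static vec attn(const vec & x) { return x; }
    static vec ffn (const vec & x) { return x; }
    static vec ln1 (const vec & x) { return x; }
    static vec ln2 (const vec & x) { return x; }

    // Parallel residual (use_par_res == true, e.g. GPT-NeoX/Pythia):
    // both sub-blocks read the same layer input x.
    static vec layer_parallel(const vec & x) {
        return add(x, add(attn(ln1(x)), ffn(ln2(x))));
    }

    // Sequential residual (use_par_res == false): the FFN reads the
    // attention output h instead of x.
    static vec layer_sequential(const vec & x) {
        vec h = add(x, attn(ln1(x)));
        return add(h, ffn(ln2(h)));
    }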
@@ -10600,8 +10898,8 @@ struct llm_build_context {

             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    NULL, NULL,
-                    LLM_NORM, cb, il);
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);

             // self-attention
@@ -10609,42 +10907,30 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
-                if (hparams.f_clamp_kqv > 0.0f) {
-                    Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
-                    cb(Qcur, "Qcur", il);
-                }

                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
-                if (hparams.f_clamp_kqv > 0.0f) {
-                    Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
-                    cb(Kcur, "Kcur", il);
-                }

                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
-                if (hparams.f_clamp_kqv > 0.0f) {
-                    Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
-                    cb(Vcur, "Vcur", il);
-                }

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);

                 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, nullptr,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -10660,8 +10946,8 @@ struct llm_build_context {

             // feed-forward network
             cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    NULL, NULL,
-                    LLM_NORM, cb, il);
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);

             cur = llm_build_ffn(ctx0, cur,
|
@@ -10672,7 +10958,26 @@ struct llm_build_context {
|
|
10672
10958
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10673
10959
|
cb(cur, "ffn_out", il);
|
10674
10960
|
|
10675
|
-
|
10961
|
+
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
10962
|
+
cb(ffn_out, "ffn_out", il);
|
10963
|
+
|
10964
|
+
// MoE
|
10965
|
+
cur = llm_build_norm(ctx0, inpSA, hparams,
|
10966
|
+
model.layers[il].ffn_norm_exps, NULL,
|
10967
|
+
LLM_NORM_RMS, cb, il);
|
10968
|
+
cb(cur, "ffn_norm_exps", il);
|
10969
|
+
|
10970
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
10971
|
+
model.layers[il].ffn_gate_inp,
|
10972
|
+
model.layers[il].ffn_up_exps,
|
10973
|
+
model.layers[il].ffn_gate_exps,
|
10974
|
+
model.layers[il].ffn_down_exps,
|
10975
|
+
n_expert, n_expert_used,
|
10976
|
+
LLM_FFN_SILU, true,
|
10977
|
+
cb, il);
|
10978
|
+
cb(cur, "ffn_moe_out", il);
|
10979
|
+
|
10980
|
+
cur = ggml_add(ctx0, cur, ffn_out);
|
10676
10981
|
cb(cur, "ffn_out", il);
|
10677
10982
|
|
10678
10983
|
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
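`llm_build_moe_ffn` in the Arctic block above routes each token to `n_expert_used` of `n_expert` experts and sums the result with a dense FFN path computed in parallel. The trailing `true` argument appears to request renormalizing the selected experts' weights, an assumption based on the upstream llama.cpp signature rather than anything visible in this diff. A scalar sketch of just the routing step:

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Minimal top-k expert routing in the spirit of llm_build_moe_ffn.
    // Returns (expert index, gate weight) pairs for one token.
    std::vector<std::pair<int, float>> route_experts(
            const std::vector<float> & logits, int n_expert_used, bool norm_w) {
        // softmax over the router logits
        const float mx = *std::max_element(logits.begin(), logits.end());
        std::vector<float> p(logits.size());
        float sum = 0.0f;
        for (size_t e = 0; e < logits.size(); ++e) {
            p[e] = std::exp(logits[e] - mx);
            sum += p[e];
        }
        for (float & v : p) v /= sum;

        // pick the n_expert_used highest-probability experts
        std::vector<int> idx(p.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return p[a] > p[b]; });

        std::vector<std::pair<int, float>> picked;
        float picked_sum = 0.0f;
        for (int k = 0; k < n_expert_used; ++k) {
            picked.push_back({idx[k], p[idx[k]]});
            picked_sum += p[idx[k]];
        }
        if (norm_w) {
            // renormalize so the selected experts' weights sum to 1
            for (auto & pr : picked) pr.second /= picked_sum;
        }
        return picked;
    }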
@@ -10688,8 +10993,8 @@ struct llm_build_context {
         cur = inpL;

         cur = llm_build_norm(ctx0, cur, hparams,
-                NULL, NULL,
-                LLM_NORM, cb, -1);
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
         cb(cur, "result_norm", -1);

         // lm_head
@@ -10816,15 +11121,12 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_starcoder();
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                result = llm.build_persimmon();
-            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm.build_refact();
             } break;
         case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
             {
                 result = llm.build_bert();
@@ -10913,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                result = llm.build_gptneox();
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                result = llm.build_arctic();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -11032,11 +11342,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                         if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
                             f = -INFINITY;
                         } else {
-                            f = 0.0f;
+                            if (hparams.use_alibi) {
+                                f = -fabs(lctx.kv_self.cells[i].pos - pos);
+                            } else {
+                                f = 0.0f;
+                            }
                         }
                         data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                     }
                 }
+
+                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                    for (int j = 0; j < n_kv; ++j) {
+                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                    }
+                }
             }
         } else {
             // when using kv cache, the mask needs to match the kv cache size
@@ -11055,7 +11375,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                     float f = -INFINITY;
                     for (int s = 0; s < batch.n_seq_id[i]; ++s) {
                         if (batch.seq_id[i][s] == seq_id) {
-                            f = 0.0f;
+                            if (hparams.use_alibi) {
+                                f = -fabs(batch.pos[i] - batch.pos[j]);
+                            } else {
+                                f = 0.0f;
+                            }
                             break;
                         }
                     }
@@ -11071,21 +11395,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
             }
         }

-        // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
-        // this allows to process multiple sequences in parallel with ALiBi-based models
-        if (hparams.use_alibi) {
-            const int64_t n_kv = kv_self.n;
-
-            GGML_ASSERT(lctx.inp_KQ_pos);
-            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
-            float * data = (float *) lctx.inp_KQ_pos->data;
-
-            for (int i = 0; i < n_kv; ++i) {
-                data[i] = float(lctx.kv_self.cells[i].pos);
-            }
-        }
-
         if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
             const int64_t n_tokens = batch.n_tokens;

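The hunks above fold ALiBi into the existing `KQ_mask` instead of a separate `KQ_pos` tensor: visible entries carry the negative token distance, and the per-head slope is applied later inside the softmax via `f_max_alibi_bias`. A minimal sketch of one mask row, with illustrative function and parameter names:

    #include <cmath>
    #include <vector>

    // Build one head-agnostic mask row the way llama_set_inputs now does:
    // -INFINITY for masked entries; for visible entries either 0, or the
    // negative distance that ALiBi models expect (the per-head slope is
    // applied later, inside the softmax, from f_max_alibi_bias).
    std::vector<float> build_mask_row(int pos_q, const std::vector<int> & pos_kv,
                                      bool use_alibi) {
        std::vector<float> row(pos_kv.size());
        for (size_t i = 0; i < pos_kv.size(); ++i) {
            if (pos_kv[i] > pos_q) {
                row[i] = -INFINITY;                              // causal mask
            } else if (use_alibi) {
                row[i] = -std::fabs(float(pos_kv[i] - pos_q));   // distance bias
            } else {
                row[i] = 0.0f;
            }
        }
        return row;
    }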
@@ -11259,11 +11568,6 @@ static void llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11278,10 +11582,6 @@ static void llama_graph_compute(
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);

     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
 }

 // decode a batch of tokens by evaluating the transformer
@@ -11319,12 +11619,6 @@ static int llama_decode_internal(
     }
     lctx.n_queued_tokens += n_tokens_all;

-#ifdef GGML_USE_MPI
-    // TODO: needs fix after #3228
-    GGML_ASSERT(false && "not implemented");
-    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
     auto & kv_self = lctx.kv_self;

     const int64_t n_embd = hparams.n_embd;
@@ -11455,7 +11749,8 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+            const uint32_t pad = llama_kv_cache_get_padding(cparams);
+            kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
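The last hunk above replaces a hard-coded KV-cache padding with a value obtained from `llama_kv_cache_get_padding(cparams)`, so backends can demand a coarser granularity. The arithmetic in isolation, assuming a power-of-two pad as `GGML_PAD` does (the helper names below are illustrative):

    #include <algorithm>
    #include <cstdint>

    // GGML_PAD(x, n): round x up to the next multiple of n (n must be a
    // power of two for this bitmask form).
    static uint32_t pad_to(uint32_t x, uint32_t n) {
        return (x + n - 1) & ~(n - 1);
    }

    // The cache-size heuristic in isolation: attend to at least one padded
    // block, at most the whole cache, rounded up to the backend's
    // preferred granularity.
    static uint32_t kv_cells_to_attend(uint32_t cell_max, uint32_t pad,
                                       uint32_t kv_size) {
        return std::min(kv_size, std::max(pad, pad_to(cell_max, pad)));
    }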
@@ -12200,13 +12495,14 @@ struct llm_tokenizer_bpe {

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        bool ignore_merges = false;

         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
-
+                        ignore_merges = true;
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12511,12 @@ struct llm_tokenizer_bpe {
                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                         word_collection = unicode_regex_split(text, {
                             "[\r\n]",
@@ -12266,6 +12568,7 @@ struct llm_tokenizer_bpe {
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
                     case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
@@ -12298,6 +12601,11 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;

+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
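`ignore_merges`, enabled above for the LLaMA-3-style pre-tokenizer, short-circuits BPE when a pre-split word already exists verbatim in the vocabulary: the whole word becomes a single symbol and the merge loop never runs. A sketch of that fast path (the `std::map` lookup stands in for the real `vocab.token_to_id` container, and the merge loop is elided):

    #include <map>
    #include <string>
    #include <vector>

    // If the whole word is already a vocabulary entry, emit one token id
    // and skip the byte-pair merge loop entirely.
    std::vector<int> tokenize_word(const std::string & word,
                                   const std::map<std::string, int> & token_to_id,
                                   bool ignore_merges) {
        if (ignore_merges) {
            auto it = token_to_id.find(word);
            if (it != token_to_id.end()) {
                return { it->second };  // single whole-word token
            }
        }
        std::vector<int> out;
        // ... fall through to the usual BPE merge loop over UTF-8 chunks ...
        return out;
    }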
@@ -12483,16 +12791,16 @@ struct llm_tokenizer_wpm {
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
         for (uint32_t code : cpts_nfd) {
-            int type = unicode_cpt_type(code);
-            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+            const codepoint_flags flags = unicode_cpt_flags(code);
+            if (flags.is_accent_mark || flags.is_control) {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
+            if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
-            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
                 new_str += " ";
@@ -12695,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
     // tokenizer.encode('', add_special_tokens=True) returns [1]
     // tokenizer.encode('', add_special_tokens=False) returns []

+    static const bool rtrim = true;  //TODO: as param
+    bool is_prev_special = false;
+    bool special_token_rtrim = false;
+
     if (add_special && vocab.special_add_bos != 0) {
         GGML_ASSERT(vocab.special_bos_id != -1);
         output.push_back(vocab.special_bos_id);
+        is_prev_special = true;
     }

     for (const auto & fragment : fragment_buffer) {
@@ -12709,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
                     // and passing 'add space prefix' as bool argument
                     //
                     auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-                    if (&fragment == &fragment_buffer.front()) {
-                        if (vocab.add_space_prefix) {
-                            raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+                    if (special_token_rtrim) {
+                        size_t num_whitespaces = 0;
+                        while (isspace(raw_text[num_whitespaces])) {
+                            num_whitespaces++;
+                        }
+                        if (num_whitespaces == raw_text.size()) {
+                            continue; // skip if all whitespaces
+                        }
+                        raw_text = raw_text.substr(num_whitespaces);
+                    }
+
+                    if (vocab.add_space_prefix) {
+                        if (!output.size() || is_prev_special) { // prefix with space if first token
+                            raw_text = " " + raw_text;
                         }
                     }

@@ -12723,9 +13048,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
                     tokenizer.tokenize(raw_text, output);
                 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                     output.push_back(fragment.token);
+                    is_prev_special = true;
+                    // phi-3 special tokens without rtrim, works fine for llama-spm too
+                    special_token_rtrim = rtrim
+                        && fragment.token != vocab.special_bos_id
+                        && fragment.token != vocab.special_unk_id
+                        && fragment.token != vocab.special_eos_id;
                 }
             }

+            if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                LLAMA_LOG_WARN(
+                    "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                    "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                    "Are you sure this is what you want?\n", __FUNCTION__);
+            }
+
             if (add_special && vocab.special_add_eos == 1) {
                 GGML_ASSERT(vocab.special_eos_id != -1);
                 output.push_back(vocab.special_eos_id);
@@ -12752,7 +13090,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
                     }
                 }

-                GGML_ASSERT(vocab.special_add_eos != 1);
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && vocab.special_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_add_eos != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
             {
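The tokenizer changes above do two things: after special tokens that request it, leading whitespace of the following text fragment is stripped (the phi-3 "rtrim" behavior), and a warning is now logged when a BOS token is added to a prompt that already begins with one. A sketch of the whitespace-stripping step, with an illustrative function name and an explicit bounds check:

    #include <cctype>
    #include <string>

    // After emitting a special token that requests rtrim, drop the leading
    // whitespace of the next raw-text fragment. Returns false when the
    // fragment is all whitespace, in which case the caller skips it.
    static bool rtrim_fragment(std::string & raw_text) {
        size_t n = 0;
        while (n < raw_text.size() && isspace((unsigned char) raw_text[n])) {
            n++;
        }
        if (n == raw_text.size()) {
            return false;  // nothing left: skip this fragment
        }
        raw_text = raw_text.substr(n);
        return true;
    }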
@@ -13106,6 +13454,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
     return rejects;
 }

+static bool llama_grammar_detect_left_recursion(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        size_t rule_index,
+        std::vector<bool> * rules_visited,
+        std::vector<bool> * rules_in_progress,
+        std::vector<bool> * rules_may_be_empty) {
+    if ((*rules_in_progress)[rule_index]) {
+        return true;
+    }
+
+    (*rules_in_progress)[rule_index] = true;
+
+    const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+    // First check if the rule might produce the empty string. This could be done combined with the second
+    // step but it's more readable as two steps.
+    bool at_rule_start = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            if (at_rule_start) {
+                (*rules_may_be_empty)[rule_index] = true;
+                break;
+            }
+            at_rule_start = true;
+        } else {
+            at_rule_start = false;
+        }
+    }
+
+    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+    // be empty)
+    bool recurse_into_nonterminal = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                return true;
+            }
+            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                recurse_into_nonterminal = false;
+            }
+        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            recurse_into_nonterminal = true;
+        } else {
+            recurse_into_nonterminal = false;
+        }
+    }
+
+    (*rules_in_progress)[rule_index] = false;
+    (*rules_visited)[rule_index] = true;
+    return false;
+}
+
 //
 // grammar - external
 //
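`llama_grammar_detect_left_recursion` above rejects grammars in which a nonterminal can derive itself at its left edge without consuming input, since expanding such a rule's parse stacks would never terminate; the emptiness pass exists because a nullable leading nonterminal still exposes the next symbol on the left. Two GBNF fragments illustrating the rejected and accepted shapes (held in C++ strings here purely for illustration):

    #include <cstdio>
    #include <string>

    // The first rule can expand to itself at its left edge without
    // consuming a character, so building its parse stacks never
    // terminates; the second consumes `term` before recursing.
    static const std::string left_recursive =
        "expr ::= expr \"+\" term | term";   // rejected: left recursion
    static const std::string right_recursive =
        "expr ::= term (\"+\" term)*";       // accepted

    int main() {
        printf("rejected: %s\n", left_recursive.c_str());
        printf("accepted: %s\n", right_recursive.c_str());
    }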
@@ -13125,6 +13525,19 @@ struct llama_grammar * llama_grammar_init(
|
|
13125
13525
|
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
13126
13526
|
}
|
13127
13527
|
|
13528
|
+
// Check for left recursion
|
13529
|
+
std::vector<bool> rules_visited(n_rules);
|
13530
|
+
std::vector<bool> rules_in_progress(n_rules);
|
13531
|
+
std::vector<bool> rules_may_be_empty(n_rules);
|
13532
|
+
for (size_t i = 0; i < n_rules; i++) {
|
13533
|
+
if (rules_visited[i]) {
|
13534
|
+
continue;
|
13535
|
+
}
|
13536
|
+
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
13537
|
+
throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
|
13538
|
+
}
|
13539
|
+
}
|
13540
|
+
|
13128
13541
|
// loop over alternates of start rule to build initial stacks
|
13129
13542
|
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
13130
13543
|
pos = vec_rules[start_rule_index].data();
|
@@ -13147,6 +13560,9 @@ struct llama_grammar * llama_grammar_init(
|
|
13147
13560
|
}
|
13148
13561
|
} while (true);
|
13149
13562
|
|
13563
|
+
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
13564
|
+
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
13565
|
+
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
13150
13566
|
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
13151
13567
|
}
|
13152
13568
|
|
@@ -13741,9 +14157,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -13757,9 +14171,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;
 
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }
 
@@ -14344,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15246,6 +15656,7 @@ struct llama_model_params llama_model_default_params() {
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
+        /*.rpc_servers                 =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
@@ -15316,7 +15727,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }
 
 size_t llama_max_devices(void) {
-#if defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
     return 1;
 #elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15752,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15356,10 +15769,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }
 
 void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15369,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }
 
 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }
 
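Callers query these capabilities the same way as before; in RPC builds the device ceiling simply becomes GGML_RPC_MAX_SERVERS. A minimal usage sketch:

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_backend_init();
        if (llama_supports_gpu_offload()) {
            printf("offload supported, up to %zu devices\n", llama_max_devices());
        }
        llama_backend_free();
        return 0;
    }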
@@ -15402,7 +15808,17 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    if (params.rpc_servers != nullptr) {
+        // split the comma-separated servers and store them in model->rpc_servers
+        std::string servers(params.rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(",")) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            model->rpc_servers.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        model->rpc_servers.push_back(servers);
+    }
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
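Using the new field amounts to setting a comma-separated endpoint list before loading; the loader splits it as shown above. A minimal sketch (the endpoints and model path are illustrative):

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052";

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }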
@@ -15441,6 +15857,11 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     llama_context * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
@@ -15464,7 +15885,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx            = GGML_PAD(cparams.n_ctx,
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15499,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
     cparams.causal_attn = hparams.causal_attn;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15509,16 +15931,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-    if (cparams.flash_attn && hparams.use_alibi) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
-    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15554,7 +15966,17 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
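The same entry point can be exercised directly; a minimal sketch, assuming a ggml RPC server is already listening on the (illustrative) endpoint:

    #include <cstdio>
    #include "ggml-rpc.h"

    int main() {
        ggml_backend_t backend = ggml_backend_rpc_init("127.0.0.1:50052");
        if (backend == nullptr) {
            fprintf(stderr, "failed to connect\n");
            return 1;
        }
        // ... hand the backend to a scheduler, run graphs ...
        ggml_backend_free(backend);
        return 0;
    }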
@@ -15710,7 +16132,11 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-        bool pipeline_parallel =
+        bool pipeline_parallel =
+            llama_get_device_count(*model) > 1 &&
+            model->n_gpu_layers > (int)model->hparams.n_layer &&
+            model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+            params.offload_kqv;
 #ifndef GGML_USE_CUDA
         // pipeline parallelism requires support for async compute and events
         // currently this is only implemented in the CUDA backend
@@ -15753,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }
 
@@ -15803,11 +16215,11 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         // these models do not use RoPE
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -15822,13 +16234,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
-        case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_STABLELM:
@@ -15839,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
@@ -15998,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // make tensors
+    cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
         struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16006,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // allocate tensors / buffers and zero
+    cvec.ctxs.reserve(ctx_map.size());
+    cvec.bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -16829,13 +17245,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
             }
             else {
                 if (cell_range_begin != kv_self.size) {
-                    cell_ranges.
+                    cell_ranges.emplace_back(cell_range_begin, i);
                     cell_range_begin = kv_self.size;
                 }
             }
         }
         if (cell_range_begin != kv_self.size) {
-            cell_ranges.
+            cell_ranges.emplace_back(cell_range_begin, kv_self.size);
        }
 
         // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
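The push_back to emplace_back change constructs each range in place instead of building a temporary pair and moving it; a minimal sketch of the difference:

    #include <cstdint>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::pair<uint32_t, uint32_t>> ranges;
        ranges.push_back(std::make_pair(0u, 8u)); // temporary pair, then a move
        ranges.emplace_back(8u, 16u);             // constructed directly in place
        return 0;
    }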
@@ -17214,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+uint32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback      = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
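The new getters make the setter round-trippable, so callers can change thread counts temporarily and restore them afterwards. A minimal sketch (the raised count is illustrative), assuming a valid context:

    #include "llama.h"

    // temporarily raise the batch thread count, then restore the previous values
    static void with_more_batch_threads(llama_context * ctx) {
        const uint32_t n_gen   = llama_n_threads(ctx);
        const uint32_t n_batch = llama_n_threads_batch(ctx);

        llama_set_n_threads(ctx, n_gen, 16);

        // ... run prompt processing here ...

        llama_set_n_threads(ctx, n_gen, n_batch);
    }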
@@ -17648,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
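Note that the relocated branch also drops the trim() call around message->content (compare the removed copy below). A minimal sketch of selecting this path by template name (the messages and buffer size are illustrative):

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        const llama_chat_message msgs[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "What is 1 + 1?" },
        };
        std::vector<char> buf(1024);
        const int32_t n = llama_chat_apply_template(nullptr, "phi3", msgs, 2, true,
                                                    buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            // <|system|>\n...<|end|>\n<|user|>\n...<|end|>\n<|assistant|>\n
            fwrite(buf.data(), 1, (size_t) n, stdout);
        }
        return 0;
    }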
@@ -17780,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-        // Phi 3
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
     } else {
         // template not supported
         return -1;
@@ -17910,6 +18334,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
     s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
     s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
@@ -17970,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
 
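With this change a callback installed through llama_log_set is also forwarded to the CUDA backend in CUDA builds. A minimal sketch:

    #include <cstdio>
    #include "llama.h"

    // route all llama.cpp log output (Metal or CUDA backend included) to stderr
    static void log_to_stderr(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
    }

    int main() {
        llama_log_set(log_to_stderr, nullptr);
        llama_backend_init();
        // ... model loading and decoding now log through log_to_stderr ...
        llama_backend_free();
        return 0;
    }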