llama_cpp 0.14.5 → 0.14.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +18 -6
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +153 -87
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +885 -144
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
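All of the hunks below are from data/vendor/tmp/llama.cpp/llama.cpp (+885 -144). The headline changes are three new architectures (Qwen2MoE, DBRX, OLMo), GGUF-configurable FIM special tokens, StableLM 2 12B support, and a shared llm_build_moe_ffn graph helper.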
@@ -105,7 +105,7 @@
 #endif
 
 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 8
+#define LLAMA_MAX_EXPERTS 60
 
 
 //
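The raised limit appears sized for Qwen1.5-MoE-A2.7B, which uses 60 routed experts per layer; the previous cap matched Mixtral's 8 experts.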
@@ -209,6 +209,7 @@ enum llm_arch {
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
+    LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
@@ -220,6 +221,8 @@ enum llm_arch {
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
+    LLM_ARCH_DBRX,
+    LLM_ARCH_OLMO,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STABLELM,        "stablelm"   },
     { LLM_ARCH_QWEN,            "qwen"       },
     { LLM_ARCH_QWEN2,           "qwen2"      },
+    { LLM_ARCH_QWEN2MOE,        "qwen2moe"   },
     { LLM_ARCH_PHI2,            "phi2"       },
     { LLM_ARCH_PLAMO,           "plamo"      },
     { LLM_ARCH_CODESHELL,       "codeshell"  },
@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MAMBA,           "mamba"      },
     { LLM_ARCH_XVERSE,          "xverse"     },
     { LLM_ARCH_COMMAND_R,       "command-r"  },
+    { LLM_ARCH_DBRX,            "dbrx"       },
+    { LLM_ARCH_OLMO,            "olmo"       },
     { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };
 
@@ -325,6 +331,10 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
+    LLM_KV_TOKENIZER_EOT_ID,
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -397,6 +407,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
     { LLM_KV_TOKENIZER_HF_JSON,    "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV,       "tokenizer.rwkv.world" },
+    { LLM_KV_TOKENIZER_PREFIX_ID,  "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID,  "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID,  "tokenizer.ggml.middle_token_id" },
+    { LLM_KV_TOKENIZER_EOT_ID,     "tokenizer.ggml.eot_token_id" },
 };
 
 struct LLM_KV {
@@ -427,6 +441,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
@@ -438,6 +453,9 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
+    LLM_TENSOR_FFN_DOWN_SHEXP,
+    LLM_TENSOR_FFN_GATE_SHEXP,
+    LLM_TENSOR_FFN_UP_SHEXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -700,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
     {
@@ -735,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -934,6 +976,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_DBRX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_OLMO,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1690,6 +1762,7 @@ enum e_model {
     MODEL_4B,
     MODEL_7B,
     MODEL_8B,
+    MODEL_12B,
     MODEL_13B,
     MODEL_14B,
     MODEL_15B,
@@ -1705,8 +1778,10 @@ enum e_model {
     MODEL_MEDIUM,
     MODEL_LARGE,
     MODEL_XL,
+    MODEL_A2_7B,
     MODEL_8x7B,
     MODEL_8x22B,
+    MODEL_16x12B,
 };
 
 static const size_t kiB = 1024;
@@ -1890,6 +1965,12 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_exps;
     struct ggml_tensor * ffn_up_exps ;
 
+    // ff shared expert (shexp)
+    struct ggml_tensor * ffn_gate_inp_shexp;
+    struct ggml_tensor * ffn_gate_shexp;
+    struct ggml_tensor * ffn_down_shexp;
+    struct ggml_tensor * ffn_up_shexp;
+
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b;   // b3
@@ -2036,10 +2117,10 @@ struct llama_vocab {
     int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
 
     id linefeed_id       = 13;
-    id special_prefix_id = 32007;
-    id special_suffix_id = 32008;
-    id special_middle_id = 32009;
-    id special_eot_id    = 32010;
+    id special_prefix_id = -1;
+    id special_suffix_id = -1;
+    id special_middle_id = -1;
+    id special_eot_id    = -1;
 
     bool add_space_prefix = true;
 
@@ -3545,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_3B:     return "3B";
         case MODEL_7B:     return "7B";
         case MODEL_8B:     return "8B";
+        case MODEL_12B:    return "12B";
         case MODEL_13B:    return "13B";
         case MODEL_14B:    return "14B";
         case MODEL_15B:    return "15B";
@@ -3560,8 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE:  return "0.8B";
         case MODEL_XL:     return "1.5B";
+        case MODEL_A2_7B:  return "A2.7B";
         case MODEL_8x7B:   return "8x7B";
         case MODEL_8x22B:  return "8x22B";
+        case MODEL_16x12B: return "16x12B";
         default:           return "?B";
     }
 }
@@ -3834,6 +3918,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_12B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -3858,6 +3943,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_A2_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_PHI2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3983,6 +4076,28 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DBRX:
+        {
+            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+            ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
+
+            switch (hparams.n_layer) {
+                case 40: model.type = e_model::MODEL_16x12B; break;
+                default: model.type = e_model::MODEL_UNKNOWN;
+            }
+        } break;
+        case LLM_ARCH_OLMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
+
+                switch (hparams.n_layer) {
+                    case 22: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 80: model.type = e_model::MODEL_70B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -4042,6 +4157,32 @@ static void llm_load_vocab(
         vocab.special_cls_id  = -1;
         vocab.special_mask_id = -1;
 
+        // For Fill-In-the-Middle (FIM)/infill models which where converted
+        // prior to support of FIM special tokens in GGUF, the following
+        // will allow those models to continue to work. The general names
+        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+        // new versions of these models have been published.
+        std::string gen_name;
+        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+            [](unsigned char c){ return std::tolower(c); });
+
+        if (gen_name.find("code") != std::string::npos) {
+            if (model.arch == LLM_ARCH_LLAMA) {
+                vocab.special_prefix_id = 32007;
+                vocab.special_suffix_id = 32008;
+                vocab.special_middle_id = 32009;
+                vocab.special_eot_id    = 32010;
+            } else if (model.arch == LLM_ARCH_GEMMA) {
+                vocab.special_prefix_id = 67;
+                vocab.special_suffix_id = 69;
+                vocab.special_middle_id = 68;
+                vocab.special_eot_id    = 70;
+            }
+        }
+
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
             vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
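For reference, a minimal sketch of how these FIM token ids are typically consumed when assembling an infill prompt; build_infill_prompt and the PSM (prefix-suffix-middle) layout are illustrative assumptions, not part of this diff:

// Hypothetical helper; llama.cpp's infill example does the equivalent.
// CodeLlama-style PSM ordering: <PRE> prefix <SUF> suffix <MID>.
std::vector<llama_token> build_infill_prompt(const llama_vocab & vocab,
                                             const std::vector<llama_token> & prefix,
                                             const std::vector<llama_token> & suffix) {
    std::vector<llama_token> out;
    out.push_back(vocab.special_prefix_id);               // <PRE>
    out.insert(out.end(), prefix.begin(), prefix.end());  // code before the cursor
    out.push_back(vocab.special_suffix_id);               // <SUF>
    out.insert(out.end(), suffix.begin(), suffix.end());  // code after the cursor
    out.push_back(vocab.special_middle_id);               // <MID> - generation fills in here
    return out;                                           // stop when special_eot_id is sampled
}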
@@ -4155,13 +4296,17 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id  },
-            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id  },
-            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id  },
-            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id  },
-            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id  },
-            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id  },
-            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id },
+            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
+            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
+            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
+            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
+            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
+            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
+            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
@@ -4378,6 +4523,13 @@ static bool llm_load_tensors(
 
     auto & hparams = model.hparams;
 
+#ifdef GGML_USE_SYCL
+    // disable MoE with SYCL until mul_mat_id is updated
+    if (hparams.n_expert > 0) {
+        n_gpu_layers = 0;
+    }
+#endif
+
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
@@ -4475,7 +4627,7 @@ static bool llm_load_tensors(
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
 
     // for moe merged tensors
-    ctx_size += ggml_tensor_overhead()*
+    ctx_size += ggml_tensor_overhead()*n_layer*3;
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
@@ -4671,6 +4823,39 @@ static bool llm_load_tensors(
                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
                 }
             } break;
+        case LLM_ARCH_DBRX:
+        {
+            if (n_expert == 0) {
+                throw std::runtime_error("DBRX model cannot have zero experts");
+            }
+
+            model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+            // output
+            {
+                model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+            }
+
+            for (int i = 0; i < n_layer; ++i) {
+                ggml_context * ctx_layer = ctx_for_layer(i);
+                ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                auto & layer = model.layers[i];
+
+                layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+                layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert});
+                layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert});
+                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert});
+                layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert});
+            }
+        } break;
         case LLM_ARCH_BAICHUAN:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
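For scale, assuming DBRX's published configuration (n_embd = 6144, 48 query heads and 8 KV heads of 128 dimensions each), n_embd_gqa = 8 * 128 = 1024, so each fused attn_qkv row spans n_embd + 2*n_embd_gqa = 6144 + 2048 = 8192 values; the builder later slices that back into Q, K, and V views.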
@@ -4985,8 +5170,13 @@ static bool llm_load_tensors(
                 layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
                 layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
 
-                layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd});
+                // optional q and k layernorms, present in StableLM 2 12B
+                layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},    false);
+                layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+                // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+                layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+                layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, false);
 
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5029,7 +5219,13 @@ static bool llm_load_tensors(
             // output
             {
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                // if output is NULL, init from the input tok embed
+                if (model.output == NULL) {
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+                }
             }
 
             for (int i = 0; i < n_layer; ++i) {
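This tied-embedding fallback (reusing token_embd as the lm_head when no separate output tensor exists in the GGUF) decrements ml.n_created and grows ml.size_data so the loader's created-tensor accounting and progress reporting stay balanced; the same pattern appears again for OLMo further down.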
@@ -5057,6 +5253,54 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
+        case LLM_ARCH_QWEN2MOE:
+        {
+            model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+            // output
+            {
+                model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+            }
+
+            for (int i = 0; i < n_layer; ++i) {
+                ggml_context * ctx_layer = ctx_for_layer(i);
+                ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                auto & layer = model.layers[i];
+
+                layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                // optional bias tensors
+                layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+                layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+                layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+                layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                GGML_ASSERT(hparams.n_expert      > 0);
+                GGML_ASSERT(hparams.n_expert_used > 0);
+
+                // MoE branch
+                auto n_ff_exp = n_ff / hparams.n_expert_used;
+                layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
+                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
+                layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+
+                // Shared expert branch
+                layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
+                layer.ffn_gate_shexp     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), {n_embd, n_ff});
+                layer.ffn_down_shexp     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff, n_embd});
+                layer.ffn_up_shexp       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), {n_embd, n_ff});
+            }
+        } break;
         case LLM_ARCH_PHI2:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
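Note the per-expert FFN width is derived as n_ff / n_expert_used rather than read from its own key. Assuming Qwen1.5-MoE-A2.7B's published config (n_ff = 5632, n_expert_used = 4), this yields n_ff_exp = 1408, the model's moe_intermediate_size, while the shared expert keeps the full n_ff width.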
@@ -5450,6 +5694,37 @@ static bool llm_load_tensors(
                 layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
                 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
+                layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+            }
+        } break;
+        case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
+        {
+            model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+            // output
+            {
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                // if output is NULL, init from the input tok embed
+                if (model.output == NULL) {
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+                }
+            }
+
+            for (int i = 0; i < n_layer; ++i) {
+                ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                auto & layer = model.layers[i];
+
+                layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                 layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
@@ -5890,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
     return cur;
 }
 
+static struct ggml_tensor * llm_build_moe_ffn(
+        struct ggml_context * ctx,
+         struct ggml_tensor * cur,
+         struct ggml_tensor * gate_inp,
+         struct ggml_tensor * up_exps,
+         struct ggml_tensor * gate_exps,
+         struct ggml_tensor * down_exps,
+                    int64_t   n_expert,
+                    int64_t   n_expert_used,
+            llm_ffn_op_type   type_op,
+                       bool   norm_w,
+         const llm_build_cb & cb,
+                        int   il) {
+    int64_t n_embd   = cur->ne[0];
+    int64_t n_tokens = cur->ne[1];
+
+    ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
+    cb(logits, "ffn_moe_logits", il);
+
+    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+    cb(probs, "ffn_moe_probs", il);
+
+    // select experts
+    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    cb(selected_experts, "ffn_moe_topk", il);
+
+    ggml_tensor * weights = ggml_get_rows(ctx,
+            ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    cb(weights, "ffn_moe_weights", il);
+
+    if (norm_w) {
+        weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+        cb(weights_sum, "ffn_moe_weights_sum", il);
+
+        weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+        cb(weights, "ffn_moe_weights_norm", il);
+
+        weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+    }
+
+    cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+    ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(up, "ffn_moe_up", il);
+
+    ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(gate, "ffn_moe_gate", il);
+
+    switch (type_op) {
+        case LLM_FFN_SILU:
+            {
+                gate = ggml_silu(ctx, gate);
+                cb(gate, "ffn_moe_silu", il);
+            } break;
+        case LLM_FFN_GELU:
+            {
+                gate = ggml_gelu(ctx, gate);
+                cb(gate, "ffn_moe_gelu", il);
+            } break;
+        default:
+            GGML_ASSERT(false);
+    }
+
+    ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+    cb(par, "ffn_moe_gate_par", il);
+
+    ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    cb(experts, "ffn_moe_down", il);
+
+    experts = ggml_mul(ctx, experts, weights);
+
+    // aggregate experts
+    ggml_tensor * moe_out = nullptr;
+    for (int i = 0; i < n_expert_used; ++i) {
+        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+                experts->nb[2], i*experts->nb[1]);
+
+        if (i == 0) {
+            moe_out = cur_expert;
+        } else {
+            moe_out = ggml_add(ctx, moe_out, cur_expert);
+        }
+    }
+
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx, moe_out);
+    }
+
+    return moe_out;
+}
+
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
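The routing in llm_build_moe_ffn is plain top-k softmax gating. A scalar sketch of the same selection for one token, under the assumption of plain float arrays (illustration only, not ggml code):

#include <algorithm>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

// For one token: softmax over expert logits, pick the top-k experts,
// optionally renormalize their weights to sum to 1 (norm_w == true).
std::vector<std::pair<int, float>> route(const std::vector<float> & logits,
                                         int n_expert_used, bool norm_w) {
    std::vector<float> probs(logits.size());
    const float mx = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (size_t e = 0; e < logits.size(); ++e) {
        probs[e] = std::exp(logits[e] - mx);
        sum += probs[e];
    }
    for (float & p : probs) p /= sum;                        // softmax

    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; }); // top-k

    float wsum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) wsum += probs[idx[i]];

    std::vector<std::pair<int, float>> out;
    for (int i = 0; i < n_expert_used; ++i) {
        const float w = norm_w ? probs[idx[i]] / wsum : probs[idx[i]];
        out.emplace_back(idx[i], w);                         // (expert id, gate weight)
    }
    return out;
}

Each selected expert's FFN output is then scaled by its gate weight and summed, which is exactly what the ggml_mul_mat_id / ggml_mul / ggml_add chain above builds into the graph.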
@@ -6433,62 +6802,15 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur);
-            cb(logits, "ffn_moe_logits", il);
-
-            ggml_tensor * probs = ggml_soft_max(ctx0, logits);
-            cb(probs, "ffn_moe_probs", il);
-
-            // select experts
-            ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used);
-            cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
-            ggml_tensor * weights = ggml_get_rows(ctx0,
-                    ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-            cb(weights, "ffn_moe_weights", il);
-
-            weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
-            ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-            cb(weights_sum, "ffn_moe_weights_sum", il);
-
-            weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
-            cb(weights, "ffn_moe_weights_norm", il);
-
-            // compute expert outputs
-            ggml_tensor * moe_out = nullptr;
-
-            for (int i = 0; i < n_expert_used; ++i) {
-                ggml_tensor * cur_expert;
-
-                ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
-                cb(cur_up, "ffn_moe_up", il);
-
-                ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-                cb(cur_gate, "ffn_moe_gate", il);
-
-                cur_gate = ggml_silu(ctx0, cur_gate);
-                cb(cur_gate, "ffn_moe_silu", il);
-
-                cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-                cb(cur_expert, "ffn_moe_gate_par", il);
-
-                cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
-                cb(cur_expert, "ffn_moe_down", il);
-
-                cur_expert = ggml_mul(ctx0, cur_expert,
-                        ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-                cb(cur_expert, "ffn_moe_weighted", il);
-
-                if (i == 0) {
-                    moe_out = cur_expert;
-                } else {
-                    moe_out = ggml_add(ctx0, moe_out, cur_expert);
-                    cb(moe_out, "ffn_moe_out", il);
-                }
-            }
-
-            cur = moe_out;
+            cur = llm_build_moe_ffn(ctx0, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        cb, il);
+            cb(cur, "ffn_moe_out", il);
         }
 
         cur = ggml_add(ctx0, cur, ffn_inp);
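Note the two trailing arguments at this call site: LLM_FFN_SILU picks the expert activation and norm_w = true renormalizes the top-k gate weights to sum to one (the Mixtral convention); the Qwen2MoE builder added later in this diff passes false and keeps the raw softmax probabilities.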
@@ -6967,74 +7289,158 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur);
-            cb(logits, "ffn_moe_logits", il);
-
-            ggml_tensor * probs = ggml_soft_max(ctx0, logits);
-            cb(probs, "ffn_moe_probs", il);
-
-            // select experts
-            ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used);
-            cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
-            ggml_tensor * weights = ggml_get_rows(ctx0,
-                    ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-            cb(weights, "ffn_moe_weights", il);
-
-            weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-
-            ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-            cb(weights_sum, "ffn_moe_weights_sum", il);
-
-            weights = ggml_div(ctx0, weights, weights_sum);
-            cb(weights, "ffn_moe_weights_norm", il);
-
-            // compute expert outputs
-            ggml_tensor * moe_out = nullptr;
-
-            for (int i = 0; i < n_expert_used; ++i) {
-                ggml_tensor * cur_expert;
-
-                ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
-                cb(cur_up, "ffn_moe_up", il);
-
-                ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-                cb(cur_gate, "ffn_moe_gate", il);
-
-                cur_gate = ggml_gelu(ctx0, cur_gate);
-                cb(cur_gate, "ffn_moe_gelu", il);
-
-                cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-                cb(cur_expert, "ffn_moe_gate_par", il);
-
-                cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert);
-                cb(cur_expert, "ffn_moe_down", il);
-
-                cur_expert = ggml_mul(ctx0, cur_expert,
-                        ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-                cb(cur_expert, "ffn_moe_weighted", il);
-
-                if (i == 0) {
-                    moe_out = cur_expert;
-                } else {
-                    moe_out = ggml_add(ctx0, moe_out, cur_expert);
-                    cb(moe_out, "ffn_moe_out", il);
-                }
-            }
-
-            cur = moe_out;
+            cur = llm_build_moe_ffn(ctx0, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        n_expert, n_expert_used,
+                        LLM_FFN_GELU, true,
+                        cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            // Grok
+            // if layer_out_norm is present then apply it before adding the input
+            // Idea: maybe ffn_out_norm is a better name
+            if (model.layers[il].layer_out_norm) {
+                cur = llm_build_norm(ctx0, cur, hparams,
+                                model.layers[il].layer_out_norm, NULL,
+                                LLM_NORM_RMS, cb, il);
+                cb(cur, "layer_out_norm", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        // Grok
+        // multiply logits by output_multiplier_scale of 0.5773502691896257
+        cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_dbrx() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                cb(cur, "wqkv_clamped", il);
+
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].attn_out_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_out_norm", il);
+
+            cur = llm_build_moe_ffn(ctx0, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        cb, il);
+            cb(cur, "ffn_moe_out", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
@@ -7052,18 +7458,13 @@ struct llm_build_context {
         cur = inpL;
 
         cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
+                model.output_norm, NULL,
+                LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
 
         // lm_head
         cur = ggml_mul_mat(ctx0, model.output, cur);
 
-        // Grok
-        // multiply logits by output_multiplier_scale of 0.5773502691896257
-
-        cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
-
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
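The relocated Grok output multiplier 0.5773502691896257 is 1/sqrt(3) to the printed precision; it is not dropped, but moved into the restructured Grok graph in the previous hunk, leaving this epilogue scale-free.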
@@ -7923,7 +8324,7 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
+
 
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7932,6 +8333,8 @@ struct llm_build_context {
                 LLM_NORM, cb, il);
             cb(cur, "attn_norm", il);
 
+            struct ggml_tensor * inpSA = cur;
+
             // self-attention
             {
                 // compute Q and K and RoPE them
@@ -7956,15 +8359,36 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            cb(Qcur, "Qcur", il);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            cb(Kcur, "Kcur", il);
+
+            if (model.layers[il].attn_q_norm) {
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                        model.layers[il].attn_q_norm,
+                        NULL,
+                        LLM_NORM, cb, il);
+                cb(Qcur, "Qcur", il);
+            }
+            if (model.layers[il].attn_k_norm) {
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                        model.layers[il].attn_k_norm,
+                        NULL,
+                        LLM_NORM, cb, il);
+                cb(Kcur, "Kcur", il);
+            }
+
+
             Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                ctx0, Qcur, inp_pos,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
             Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                ctx0, Kcur, inp_pos,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -7979,20 +8403,25 @@ struct llm_build_context {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpL  = ggml_get_rows(ctx0,  inpL, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
             {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
+                if (model.layers[il].ffn_norm) {
+                    cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                            model.layers[il].ffn_norm,
+                            model.layers[il].ffn_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(cur, "ffn_norm", il);
+                } else {
+                    // parallel residual
+                    cur = inpSA;
+                }
                 cur = llm_build_ffn(ctx0, cur,
                     model.layers[il].ffn_up,   NULL,
                     model.layers[il].ffn_gate, NULL,
@@ -8182,12 +8611,6 @@ struct llm_build_context {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
-                // these nodes are added to the graph together so that they are not reordered
-                // by doing so, the number of splits in the graph is reduced
-                ggml_build_forward_expand(gf, Qcur);
-                ggml_build_forward_expand(gf, Kcur);
-                ggml_build_forward_expand(gf, Vcur);
-
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8254,6 +8677,150 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_qwen2moe() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * moe_out =
+                    llm_build_moe_ffn(ctx0, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, false,
+                        cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            // FFN shared expert
+            {
+                ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
+                cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
+
+                // sigmoid
+                ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
+                cb(cur_gate, "ffn_shexp_gate", il);
+
+                ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up_shexp,   NULL,
+                        model.layers[il].ffn_gate_shexp, NULL,
+                        model.layers[il].ffn_down_shexp, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur_ffn, "ffn_shexp", il);
+
+                ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
+                cb(ffn_shexp_out, "ffn_shexp_out", il);
+
+                moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
+                cb(moe_out, "ffn_out", il);
+
+                cur = moe_out;
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_phi2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
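The shared-expert gate above computes a sigmoid as ggml_div(ggml_silu(x), x), presumably because ggml had no dedicated sigmoid op at this point: silu(x) = x * sigmoid(x), so silu(x)/x = sigmoid(x) (undefined only at x = 0). A scalar check of the identity:

#include <cassert>
#include <cmath>
#include <initializer_list>

int main() {
    for (float x : {-3.0f, -0.5f, 0.25f, 2.0f}) {
        float sigmoid = 1.0f / (1.0f + std::exp(-x));
        float silu    = x * sigmoid;                    // silu(x) = x * sigmoid(x)
        assert(std::fabs(silu / x - sigmoid) < 1e-6f);  // silu(x)/x == sigmoid(x)
    }
    return 0;
}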
@@ -9588,6 +10155,139 @@ struct llm_build_context {
         return gf;
     }
 
+    // ref: https://allenai.org/olmo
+    // based on the original build_llama() function, changes:
+    //   * non-parametric layer norm
+    //   * clamp qkv
+    //   * removed bias
+    //   * removed MoE
+    struct ggml_cgraph * build_olmo() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    NULL, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    NULL, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                NULL, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -9737,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen2();
             } break;
+        case LLM_ARCH_QWEN2MOE:
+            {
+                result = llm.build_qwen2moe();
+            } break;
         case LLM_ARCH_PHI2:
             {
                 result = llm.build_phi2();
@@ -9785,6 +10489,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_command_r();
             } break;
+        case LLM_ARCH_DBRX:
+            {
+                result = llm.build_dbrx();
+            } break;
+        case LLM_ARCH_OLMO:
+            {
+                result = llm.build_olmo();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -12915,6 +13627,11 @@ struct llama_beam_search_data {
         }
         llama_logit_info logit_info(ctx);
         std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+
+        // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
+        // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         size_t i=0;
         if (next_beams.size() < n_beams) {
             for (; next_beams.size() < n_beams ; ++i) {
@@ -13535,6 +14252,10 @@ static void llama_model_quantize_internal(
     gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    // Remove split metadata
+    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
+    gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
 
     if (params->kv_overrides) {
         const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -14629,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_OLMO:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
        case LLM_ARCH_GROK:
+        case LLM_ARCH_DBRX:
         case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
+        case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
@@ -15320,6 +16044,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
                 GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
                 ctx->output_ids[id] = i;
             }
+
+            ctx->n_outputs = n_outputs;
         }
     }
 
@@ -16472,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
+    } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+        // CohereForAI/c4ai-command-r-plus
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "user") {
+                ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "assistant") {
+                ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            }
+        }
+        if (add_ass) {
+            ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
+        }
     } else {
         // template not supported
        return -1;