llama_cpp 0.14.5 → 0.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +18 -6
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +153 -87
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +885 -144
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
data/vendor/tmp/llama.cpp/llama.cpp

@@ -105,7 +105,7 @@
 #endif
 
 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 8
+#define LLAMA_MAX_EXPERTS 60
 
 
 //

@@ -209,6 +209,7 @@ enum llm_arch {
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
+    LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,

@@ -220,6 +221,8 @@ enum llm_arch {
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
+    LLM_ARCH_DBRX,
+    LLM_ARCH_OLMO,
     LLM_ARCH_UNKNOWN,
 };
 

@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
     { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },

@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
+    { LLM_ARCH_DBRX, "dbrx" },
+    { LLM_ARCH_OLMO, "olmo" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 

@@ -325,6 +331,10 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
+    LLM_KV_TOKENIZER_EOT_ID,
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

@@ -397,6 +407,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+    { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
 };
 
 struct LLM_KV {

@@ -427,6 +441,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,

@@ -438,6 +453,9 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
+    LLM_TENSOR_FFN_DOWN_SHEXP,
+    LLM_TENSOR_FFN_GATE_SHEXP,
+    LLM_TENSOR_FFN_UP_SHEXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,

@@ -700,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
     {

@@ -735,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {

@@ -934,6 +976,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_DBRX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_OLMO,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {

@@ -1690,6 +1762,7 @@ enum e_model {
     MODEL_4B,
     MODEL_7B,
     MODEL_8B,
+    MODEL_12B,
     MODEL_13B,
     MODEL_14B,
     MODEL_15B,

@@ -1705,8 +1778,10 @@ enum e_model {
     MODEL_MEDIUM,
     MODEL_LARGE,
     MODEL_XL,
+    MODEL_A2_7B,
     MODEL_8x7B,
     MODEL_8x22B,
+    MODEL_16x12B,
 };
 
 static const size_t kiB = 1024;

@@ -1890,6 +1965,12 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_exps;
     struct ggml_tensor * ffn_up_exps ;
 
+    // ff shared expert (shexp)
+    struct ggml_tensor * ffn_gate_inp_shexp;
+    struct ggml_tensor * ffn_gate_shexp;
+    struct ggml_tensor * ffn_down_shexp;
+    struct ggml_tensor * ffn_up_shexp;
+
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3

@@ -2036,10 +2117,10 @@ struct llama_vocab {
     int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
 
     id linefeed_id = 13;
-    id special_prefix_id = 32007;
-    id special_suffix_id = 32008;
-    id special_middle_id = 32009;
-    id special_eot_id = 32010;
+    id special_prefix_id = -1;
+    id special_suffix_id = -1;
+    id special_middle_id = -1;
+    id special_eot_id = -1;
 
     bool add_space_prefix = true;
 

@@ -3545,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_8B: return "8B";
+        case MODEL_12B: return "12B";
         case MODEL_13B: return "13B";
         case MODEL_14B: return "14B";
         case MODEL_15B: return "15B";

@@ -3560,8 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
         case MODEL_XL: return "1.5B";
+        case MODEL_A2_7B: return "A2.7B";
         case MODEL_8x7B: return "8x7B";
         case MODEL_8x22B: return "8x22B";
+        case MODEL_16x12B: return "16x12B";
         default: return "?B";
     }
 }

@@ -3834,6 +3918,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_12B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;

@@ -3858,6 +3943,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_A2_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_PHI2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

@@ -3983,6 +4076,28 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DBRX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_16x12B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OLMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+                switch (hparams.n_layer) {
+                    case 22: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 80: model.type = e_model::MODEL_70B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -4042,6 +4157,32 @@ static void llm_load_vocab(
         vocab.special_cls_id = -1;
         vocab.special_mask_id = -1;
 
+        // For Fill-In-the-Middle (FIM)/infill models which were converted
+        // prior to support of FIM special tokens in GGUF, the following
+        // will allow those models to continue to work. The general names
+        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+        // new versions of these models have been published.
+        std::string gen_name;
+        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+            [](unsigned char c){ return std::tolower(c); });
+
+        if (gen_name.find("code") != std::string::npos) {
+            if (model.arch == LLM_ARCH_LLAMA) {
+                vocab.special_prefix_id = 32007;
+                vocab.special_suffix_id = 32008;
+                vocab.special_middle_id = 32009;
+                vocab.special_eot_id = 32010;
+            } else if (model.arch == LLM_ARCH_GEMMA) {
+                vocab.special_prefix_id = 67;
+                vocab.special_suffix_id = 69;
+                vocab.special_middle_id = 68;
+                vocab.special_eot_id = 70;
+            }
+        }
+
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
             vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
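Note: the ids hardcoded above are the classic CodeLlama (32007-32010) and CodeGemma (67-70) FIM tokens; the new `tokenizer.ggml.*_token_id` keys let converters store them explicitly instead of relying on this name-based fallback. As a rough illustration of how a client could use the public llama.h accessors (`llama_token_prefix` and friends) that surface these ids, here is a minimal sketch — `make_infill_prompt` is a hypothetical helper, not part of the library:

#include <vector>
#include "llama.h"

// Sketch: build the token sequence <PRE> prefix <SUF> suffix <MID> used for infill.
static std::vector<llama_token> make_infill_prompt(
        const llama_model * model,
        const std::vector<llama_token> & prefix,    // tokens before the cursor
        const std::vector<llama_token> & suffix) {  // tokens after the cursor
    std::vector<llama_token> out;
    out.push_back(llama_token_prefix(model));             // 32007 / 67
    out.insert(out.end(), prefix.begin(), prefix.end());
    out.push_back(llama_token_suffix(model));             // 32008 / 69
    out.insert(out.end(), suffix.begin(), suffix.end());
    out.push_back(llama_token_middle(model));             // 32009 / 68
    return out; // generate until llama_token_eot(model) is sampled (32010 / 70)
}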
@@ -4155,13 +4296,17 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
-            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
-            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
-            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
-            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
-            { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
-            { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
+            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
+            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
+            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
+            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
+            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
+            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));

@@ -4378,6 +4523,13 @@ static bool llm_load_tensors(
 
     auto & hparams = model.hparams;
 
+#ifdef GGML_USE_SYCL
+    // disable MoE with SYCL until mul_mat_id is updated
+    if (hparams.n_expert > 0) {
+        n_gpu_layers = 0;
+    }
+#endif
+
     model.split_mode = split_mode;
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;

@@ -4475,7 +4627,7 @@ static bool llm_load_tensors(
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
 
     // for moe merged tensors
-    ctx_size += ggml_tensor_overhead()*
+    ctx_size += ggml_tensor_overhead()*n_layer*3;
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {

@@ -4671,6 +4823,39 @@ static bool llm_load_tensors(
                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
                 }
             } break;
+        case LLM_ARCH_DBRX:
+            {
+                if (n_expert == 0) {
+                    throw std::runtime_error("DBRX model cannot have zero experts");
+                }
+
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+                    layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                }
+            } break;
         case LLM_ARCH_BAICHUAN:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -4985,8 +5170,13 @@ static bool llm_load_tensors(
                     layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
                     layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
 
-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+                    // optional q and k layernorms, present in StableLM 2 12B
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+                    // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
 
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});

@@ -5029,7 +5219,13 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
                 }
 
                 for (int i = 0; i < n_layer; ++i) {

@@ -5057,6 +5253,54 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
+        case LLM_ARCH_QWEN2MOE:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    // optional bias tensors
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                    GGML_ASSERT(hparams.n_expert > 0);
+                    GGML_ASSERT(hparams.n_expert_used > 0);
+
+                    // MoE branch
+                    auto n_ff_exp = n_ff / hparams.n_expert_used;
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
+                    layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+
+                    // Shared expert branch
+                    layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
+                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+                }
+            } break;
         case LLM_ARCH_PHI2:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -5450,6 +5694,37 @@ static bool llm_load_tensors(
                     layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                }
+            } break;
+        case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});

@@ -5890,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
     return cur;
 }
 
+static struct ggml_tensor * llm_build_moe_ffn(
+        struct ggml_context * ctx,
+        struct ggml_tensor * cur,
+        struct ggml_tensor * gate_inp,
+        struct ggml_tensor * up_exps,
+        struct ggml_tensor * gate_exps,
+        struct ggml_tensor * down_exps,
+        int64_t n_expert,
+        int64_t n_expert_used,
+        llm_ffn_op_type type_op,
+        bool norm_w,
+        const llm_build_cb & cb,
+        int il) {
+    int64_t n_embd = cur->ne[0];
+    int64_t n_tokens = cur->ne[1];
+
+    ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
+    cb(logits, "ffn_moe_logits", il);
+
+    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+    cb(probs, "ffn_moe_probs", il);
+
+    // select experts
+    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    cb(selected_experts, "ffn_moe_topk", il);
+
+    ggml_tensor * weights = ggml_get_rows(ctx,
+            ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    cb(weights, "ffn_moe_weights", il);
+
+    if (norm_w) {
+        weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+        cb(weights_sum, "ffn_moe_weights_sum", il);
+
+        weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+        cb(weights, "ffn_moe_weights_norm", il);
+
+        weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+    }
+
+    cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+    ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(up, "ffn_moe_up", il);
+
+    ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(gate, "ffn_moe_gate", il);
+
+    switch (type_op) {
+        case LLM_FFN_SILU:
+            {
+                gate = ggml_silu(ctx, gate);
+                cb(gate, "ffn_moe_silu", il);
+            } break;
+        case LLM_FFN_GELU:
+            {
+                gate = ggml_gelu(ctx, gate);
+                cb(gate, "ffn_moe_gelu", il);
+            } break;
+        default:
+            GGML_ASSERT(false);
+    }
+
+    ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+    cb(par, "ffn_moe_gate_par", il);
+
+    ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    cb(experts, "ffn_moe_down", il);
+
+    experts = ggml_mul(ctx, experts, weights);
+
+    // aggregate experts
+    ggml_tensor * moe_out = nullptr;
+    for (int i = 0; i < n_expert_used; ++i) {
+        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+                experts->nb[2], i*experts->nb[1]);
+
+        if (i == 0) {
+            moe_out = cur_expert;
+        } else {
+            moe_out = ggml_add(ctx, moe_out, cur_expert);
+        }
+    }
+
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx, moe_out);
+    }
+
+    return moe_out;
+}
+
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
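The new `llm_build_moe_ffn` helper expresses the expert routing as ggml graph ops; in scalar form, what it computes per token is a softmax over the expert logits, top-k selection, and (when `norm_w` is set, as for Mixtral, Grok and DBRX, but not Qwen2MoE) renormalization so the selected weights sum to 1. A standalone sketch of that routing math, for clarity only:

#include <algorithm>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

// Returns the (expert index, routing weight) pairs for one token.
// Assumes n_expert_used <= logits.size().
std::vector<std::pair<int, float>> route_token(
        const std::vector<float> & logits, int n_expert_used, bool norm_w) {
    // softmax over expert logits
    const float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(logits.size());
    float sum = 0.0f;
    for (size_t e = 0; e < logits.size(); ++e) {
        probs[e] = std::exp(logits[e] - mx);
        sum += probs[e];
    }
    for (float & p : probs) { p /= sum; }

    // top-k experts by probability (argsort, keep the first k)
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });

    std::vector<std::pair<int, float>> selected;
    float wsum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        selected.emplace_back(idx[i], probs[idx[i]]);
        wsum += probs[idx[i]];
    }
    if (norm_w) {
        for (auto & s : selected) { s.second /= wsum; } // weights now sum to 1
    }
    return selected; // token output = sum_i weight_i * expert_i(x)
}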
@@ -6433,62 +6802,15 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
-            cb(logits, "ffn_moe_logits", il);
-
-            ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
-            cb(probs, "ffn_moe_probs", il);
-
-            // select experts
-            ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
-            cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
-            ggml_tensor * weights = ggml_get_rows(ctx0,
-                    ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-            cb(weights, "ffn_moe_weights", il);
-
-            weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
-            ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-            cb(weights_sum, "ffn_moe_weights_sum", il);
-
-            weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
-            cb(weights, "ffn_moe_weights_norm", il);
-
-            // compute expert outputs
-            ggml_tensor * moe_out = nullptr;
-
-            for (int i = 0; i < n_expert_used; ++i) {
-                ggml_tensor * cur_expert;
-
-                ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
-                cb(cur_up, "ffn_moe_up", il);
-
-                ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-                cb(cur_gate, "ffn_moe_gate", il);
-
-                cur_gate = ggml_silu(ctx0, cur_gate);
-                cb(cur_gate, "ffn_moe_silu", il);
-
-                cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-                cb(cur_expert, "ffn_moe_gate_par", il);
-
-                cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
-                cb(cur_expert, "ffn_moe_down", il);
-
-                cur_expert = ggml_mul(ctx0, cur_expert,
-                        ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-                cb(cur_expert, "ffn_moe_weighted", il);
-
-                if (i == 0) {
-                    moe_out = cur_expert;
-                } else {
-                    moe_out = ggml_add(ctx0, moe_out, cur_expert);
-                    cb(moe_out, "ffn_moe_out", il);
-                }
-            }
-
-            cur = moe_out;
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
         }
 
         cur = ggml_add(ctx0, cur, ffn_inp);

@@ -6967,74 +7289,158 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
-            cb(logits, "ffn_moe_logits", il);
-
-            ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
-            cb(probs, "ffn_moe_probs", il);
-
-            // select experts
-            ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
-            cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
-            ggml_tensor * weights = ggml_get_rows(ctx0,
-                    ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-            cb(weights, "ffn_moe_weights", il);
-
-            weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
-            ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-            cb(weights_sum, "ffn_moe_weights_sum", il);
-
-            weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
-            cb(weights, "ffn_moe_weights_norm", il);
-
-            // compute expert outputs
-            ggml_tensor * moe_out = nullptr;
-
-            for (int i = 0; i < n_expert_used; ++i) {
-                ggml_tensor * cur_expert;
-
-                ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
-                cb(cur_up, "ffn_moe_up", il);
-
-                ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-                cb(cur_gate, "ffn_moe_gate", il);
-
-                //GeLU
-                cur_gate = ggml_gelu(ctx0, cur_gate);
-                cb(cur_gate, "ffn_moe_gelu", il);
-
-                cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-                cb(cur_expert, "ffn_moe_gate_par", il);
-
-                cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
-                cb(cur_expert, "ffn_moe_down", il);
-
-                cur_expert = ggml_mul(ctx0, cur_expert,
-                        ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-                cb(cur_expert, "ffn_moe_weighted", il);
-
-                if (i == 0) {
-                    moe_out = cur_expert;
-                } else {
-                    moe_out = ggml_add(ctx0, moe_out, cur_expert);
-                    cb(moe_out, "ffn_moe_out", il);
-                }
-            }
-
-            cur = moe_out;
-
-            // Grok
-            // if layer_out_norm is present then apply it before adding the input
-            // Idea: maybe ffn_out_norm is a better name
-            if (model.layers[il].layer_out_norm) {
-                cur = llm_build_norm(ctx0, cur, hparams,
-                        model.layers[il].layer_out_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "layer_out_norm", il);
-            }
-
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_GELU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            // Grok
+            // if layer_out_norm is present then apply it before adding the input
+            // Idea: maybe ffn_out_norm is a better name
+            if (model.layers[il].layer_out_norm) {
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].layer_out_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "layer_out_norm", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        // Grok
+        // multiply logits by output_multiplier_scale of 0.5773502691896257
+
+        cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_dbrx() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                cb(cur, "wqkv_clamped", il);
+
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].attn_out_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_out_norm", il);
+
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
 
         cur = ggml_add(ctx0, cur, ffn_inp);
         cb(cur, "ffn_out", il);
|
|
7052
7458
|
cur = inpL;
|
7053
7459
|
|
7054
7460
|
cur = llm_build_norm(ctx0, cur, hparams,
|
7055
|
-
|
7056
|
-
|
7461
|
+
model.output_norm, NULL,
|
7462
|
+
LLM_NORM, cb, -1);
|
7057
7463
|
cb(cur, "result_norm", -1);
|
7058
7464
|
|
7059
7465
|
// lm_head
|
7060
7466
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7061
7467
|
|
7062
|
-
// Grok
|
7063
|
-
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7064
|
-
|
7065
|
-
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7066
|
-
|
7067
7468
|
cb(cur, "result_output", -1);
|
7068
7469
|
|
7069
7470
|
ggml_build_forward_expand(gf, cur);
|
@@ -7923,7 +8324,7 @@ struct llm_build_context {
|
|
7923
8324
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7924
8325
|
|
7925
8326
|
for (int il = 0; il < n_layer; ++il) {
|
7926
|
-
|
8327
|
+
|
7927
8328
|
|
7928
8329
|
// norm
|
7929
8330
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
@@ -7932,6 +8333,8 @@ struct llm_build_context {
|
|
7932
8333
|
LLM_NORM, cb, il);
|
7933
8334
|
cb(cur, "attn_norm", il);
|
7934
8335
|
|
8336
|
+
struct ggml_tensor * inpSA = cur;
|
8337
|
+
|
7935
8338
|
// self-attention
|
7936
8339
|
{
|
7937
8340
|
// compute Q and K and RoPE them
|
@@ -7956,15 +8359,36 @@ struct llm_build_context {
|
|
7956
8359
|
cb(Vcur, "Vcur", il);
|
7957
8360
|
}
|
7958
8361
|
|
8362
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8363
|
+
cb(Qcur, "Qcur", il);
|
8364
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8365
|
+
cb(Kcur, "Kcur", il);
|
8366
|
+
|
8367
|
+
if (model.layers[il].attn_q_norm) {
|
8368
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
8369
|
+
model.layers[il].attn_q_norm,
|
8370
|
+
NULL,
|
8371
|
+
LLM_NORM, cb, il);
|
8372
|
+
cb(Qcur, "Qcur", il);
|
8373
|
+
}
|
8374
|
+
if (model.layers[il].attn_k_norm) {
|
8375
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
8376
|
+
model.layers[il].attn_k_norm,
|
8377
|
+
NULL,
|
8378
|
+
LLM_NORM, cb, il);
|
8379
|
+
cb(Kcur, "Kcur", il);
|
8380
|
+
}
|
8381
|
+
|
8382
|
+
|
7959
8383
|
Qcur = ggml_rope_custom(
|
7960
|
-
ctx0,
|
8384
|
+
ctx0, Qcur, inp_pos,
|
7961
8385
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7962
8386
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7963
8387
|
);
|
7964
8388
|
cb(Qcur, "Qcur", il);
|
7965
8389
|
|
7966
8390
|
Kcur = ggml_rope_custom(
|
7967
|
-
ctx0,
|
8391
|
+
ctx0, Kcur, inp_pos,
|
7968
8392
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7969
8393
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7970
8394
|
);
|
@@ -7979,20 +8403,25 @@ struct llm_build_context {
|
|
7979
8403
|
// skip computing output for unused tokens
|
7980
8404
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7981
8405
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8406
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7982
8407
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7983
8408
|
}
|
7984
8409
|
|
7985
|
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur,
|
8410
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7986
8411
|
cb(ffn_inp, "ffn_inp", il);
|
7987
8412
|
|
7988
8413
|
// feed-forward network
|
7989
8414
|
{
|
7990
|
-
|
7991
|
-
|
7992
|
-
|
7993
|
-
|
7994
|
-
|
7995
|
-
|
8415
|
+
if (model.layers[il].ffn_norm) {
|
8416
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8417
|
+
model.layers[il].ffn_norm,
|
8418
|
+
model.layers[il].ffn_norm_b,
|
8419
|
+
LLM_NORM, cb, il);
|
8420
|
+
cb(cur, "ffn_norm", il);
|
8421
|
+
} else {
|
8422
|
+
// parallel residual
|
8423
|
+
cur = inpSA;
|
8424
|
+
}
|
7996
8425
|
cur = llm_build_ffn(ctx0, cur,
|
7997
8426
|
model.layers[il].ffn_up, NULL,
|
7998
8427
|
model.layers[il].ffn_gate, NULL,
|
@@ -8182,12 +8611,6 @@ struct llm_build_context {
|
|
8182
8611
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8183
8612
|
cb(Vcur, "Vcur", il);
|
8184
8613
|
|
8185
|
-
// these nodes are added to the graph together so that they are not reordered
|
8186
|
-
// by doing so, the number of splits in the graph is reduced
|
8187
|
-
ggml_build_forward_expand(gf, Qcur);
|
8188
|
-
ggml_build_forward_expand(gf, Kcur);
|
8189
|
-
ggml_build_forward_expand(gf, Vcur);
|
8190
|
-
|
8191
8614
|
Qcur = ggml_rope_custom(
|
8192
8615
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8193
8616
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
@@ -8254,6 +8677,150 @@ struct llm_build_context {
|
|
8254
8677
|
return gf;
|
8255
8678
|
}
|
8256
8679
|
|
8680
|
+
struct ggml_cgraph * build_qwen2moe() {
|
8681
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8682
|
+
|
8683
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
8684
|
+
int32_t n_tokens = this->n_tokens;
|
8685
|
+
|
8686
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
8687
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
8688
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
8689
|
+
|
8690
|
+
struct ggml_tensor * cur;
|
8691
|
+
struct ggml_tensor * inpL;
|
8692
|
+
|
8693
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
8694
|
+
|
8695
|
+
// inp_pos - contains the positions
|
8696
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
8697
|
+
|
8698
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8699
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8700
|
+
|
8701
|
+
for (int il = 0; il < n_layer; ++il) {
|
8702
|
+
struct ggml_tensor * inpSA = inpL;
|
8703
|
+
|
8704
|
+
// norm
|
8705
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
8706
|
+
model.layers[il].attn_norm, NULL,
|
8707
|
+
LLM_NORM_RMS, cb, il);
|
8708
|
+
cb(cur, "attn_norm", il);
|
8709
|
+
|
8710
|
+
// self_attention
|
8711
|
+
{
|
8712
|
+
// compute Q and K and RoPE them
|
8713
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
8714
|
+
cb(Qcur, "Qcur", il);
|
8715
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
8716
|
+
cb(Qcur, "Qcur", il);
|
8717
|
+
|
8718
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
8719
|
+
cb(Kcur, "Kcur", il);
|
8720
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
8721
|
+
cb(Kcur, "Kcur", il);
|
8722
|
+
|
8723
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
8724
|
+
cb(Vcur, "Vcur", il);
|
8725
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8726
|
+
cb(Vcur, "Vcur", il);
|
8727
|
+
|
8728
|
+
Qcur = ggml_rope_custom(
|
8729
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8730
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8731
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8732
|
+
);
|
8733
|
+
cb(Qcur, "Qcur", il);
|
8734
|
+
|
8735
|
+
Kcur = ggml_rope_custom(
|
8736
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8737
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8738
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8739
|
+
);
|
8740
|
+
cb(Kcur, "Kcur", il);
|
8741
|
+
|
8742
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8743
|
+
model.layers[il].wo, model.layers[il].bo,
|
8744
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8745
|
+
}
|
8746
|
+
|
8747
|
+
if (il == n_layer - 1) {
|
8748
|
+
// skip computing output for unused tokens
|
8749
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8750
|
+
n_tokens = n_outputs;
|
8751
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8752
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8753
|
+
}
|
8754
|
+
|
8755
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
8756
|
+
cb(ffn_inp, "ffn_inp", il);
|
8757
|
+
|
8758
|
+
// MoE branch
|
8759
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8760
|
+
model.layers[il].ffn_norm, NULL,
|
8761
|
+
LLM_NORM_RMS, cb, il);
|
8762
|
+
cb(cur, "ffn_norm", il);
|
8763
|
+
|
8764
|
+
ggml_tensor * moe_out =
|
8765
|
+
llm_build_moe_ffn(ctx0, cur,
|
8766
|
+
model.layers[il].ffn_gate_inp,
|
8767
|
+
model.layers[il].ffn_up_exps,
|
8768
|
+
model.layers[il].ffn_gate_exps,
|
8769
|
+
model.layers[il].ffn_down_exps,
|
8770
|
+
n_expert, n_expert_used,
|
8771
|
+
LLM_FFN_SILU, false,
|
8772
|
+
cb, il);
|
8773
|
+
cb(cur, "ffn_moe_out", il);
|
8774
|
+
|
8775
|
+
// FFN shared expert
|
8776
|
+
{
|
8777
|
+
ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
|
8778
|
+
cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
|
8779
|
+
|
8780
|
+
// sigmoid
|
8781
|
+
ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
|
8782
|
+
cb(cur_gate, "ffn_shexp_gate", il);
|
8783
|
+
|
8784
|
+
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
|
8785
|
+
model.layers[il].ffn_up_shexp, NULL,
|
8786
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
8787
|
+
model.layers[il].ffn_down_shexp, NULL,
|
8788
|
+
NULL,
|
8789
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
8790
|
+
cb(cur_ffn, "ffn_shexp", il);
|
8791
|
+
|
8792
|
+
ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
|
8793
|
+
cb(ffn_shexp_out, "ffn_shexp_out", il);
|
8794
|
+
|
8795
|
+
moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
|
8796
|
+
cb(moe_out, "ffn_out", il);
|
8797
|
+
|
8798
|
+
cur = moe_out;
|
8799
|
+
}
|
8800
|
+
|
8801
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
8802
|
+
cb(cur, "l_out", il);
|
8803
|
+
|
8804
|
+
// input for next layer
|
8805
|
+
inpL = cur;
|
8806
|
+
}
|
8807
|
+
|
8808
|
+
cur = inpL;
|
8809
|
+
|
8810
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
8811
|
+
model.output_norm, NULL,
|
8812
|
+
LLM_NORM_RMS, cb, -1);
|
8813
|
+
cb(cur, "result_norm", -1);
|
8814
|
+
|
8815
|
+
// lm_head
|
8816
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8817
|
+
cb(cur, "result_output", -1);
|
8818
|
+
|
8819
|
+
ggml_build_forward_expand(gf, cur);
|
8820
|
+
|
8821
|
+
return gf;
|
8822
|
+
}
|
8823
|
+
|
8257
8824
|
struct ggml_cgraph * build_phi2() {
|
8258
8825
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8259
8826
|
|
@@ -9588,6 +10155,139 @@ struct llm_build_context {
|
|
9588
10155
|
return gf;
|
9589
10156
|
|
9590
10157
|
}
|
10158
|
+
|
10159
|
+
// ref: https://allenai.org/olmo
|
10160
|
+
// based on the original build_llama() function, changes:
|
10161
|
+
// * non-parametric layer norm
|
10162
|
+
// * clamp qkv
|
10163
|
+
// * removed bias
|
10164
|
+
// * removed MoE
|
10165
|
+
struct ggml_cgraph * build_olmo() {
|
10166
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10167
|
+
|
10168
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
10169
|
+
int32_t n_tokens = this->n_tokens;
|
10170
|
+
|
10171
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10172
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10173
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
10174
|
+
|
10175
|
+
struct ggml_tensor * cur;
|
10176
|
+
struct ggml_tensor * inpL;
|
10177
|
+
|
10178
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10179
|
+
|
10180
|
+
// inp_pos - contains the positions
|
10181
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10182
|
+
|
10183
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10184
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10185
|
+
|
10186
|
+
for (int il = 0; il < n_layer; ++il) {
|
10187
|
+
struct ggml_tensor * inpSA = inpL;
|
10188
|
+
|
10189
|
+
// norm
|
10190
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10191
|
+
NULL, NULL,
|
10192
|
+
LLM_NORM, cb, il);
|
10193
|
+
cb(cur, "attn_norm", il);
|
10194
|
+
|
10195
|
+
// self-attention
|
10196
|
+
{
|
10197
|
+
// compute Q and K and RoPE them
|
10198
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10199
|
+
cb(Qcur, "Qcur", il);
|
10200
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10201
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10202
|
+
cb(Qcur, "Qcur", il);
|
10203
|
+
}
|
10204
|
+
|
10205
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10206
|
+
cb(Kcur, "Kcur", il);
|
10207
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10208
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10209
|
+
cb(Kcur, "Kcur", il);
|
10210
|
+
}
|
10211
|
+
|
10212
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10213
|
+
cb(Vcur, "Vcur", il);
|
10214
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10215
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10216
|
+
cb(Vcur, "Vcur", il);
|
10217
|
+
}
|
10218
|
+
|
10219
|
+
Qcur = ggml_rope_custom(
|
10220
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10221
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10222
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10223
|
+
);
|
10224
|
+
cb(Qcur, "Qcur", il);
|
10225
|
+
|
10226
|
+
Kcur = ggml_rope_custom(
|
10227
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10228
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10229
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10230
|
+
);
|
10231
|
+
cb(Kcur, "Kcur", il);
|
10232
|
+
|
10233
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10234
|
+
model.layers[il].wo, nullptr,
|
10235
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10236
|
+
}
|
10237
|
+
|
10238
|
+
if (il == n_layer - 1) {
|
10239
|
+
// skip computing output for unused tokens
|
10240
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10241
|
+
n_tokens = n_outputs;
|
10242
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10243
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
10244
|
+
}
|
10245
|
+
|
10246
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
10247
|
+
cb(ffn_inp, "ffn_inp", il);
|
10248
|
+
|
10249
|
+
// feed-forward network
|
10250
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10251
|
+
NULL, NULL,
|
10252
|
+
LLM_NORM, cb, il);
|
10253
|
+
cb(cur, "ffn_norm", il);
|
10254
|
+
|
10255
|
+
cur = llm_build_ffn(ctx0, cur,
|
10256
|
+
model.layers[il].ffn_up, NULL,
|
10257
|
+
model.layers[il].ffn_gate, NULL,
|
10258
|
+
model.layers[il].ffn_down, NULL,
|
10259
|
+
NULL,
|
10260
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10261
|
+
cb(cur, "ffn_out", il);
|
10262
|
+
|
10263
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
10264
|
+
cb(cur, "ffn_out", il);
|
10265
|
+
|
10266
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
10267
|
+
if (layer_dir != nullptr) {
|
10268
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
10269
|
+
}
|
10270
|
+
cb(cur, "l_out", il);
|
10271
|
+
|
10272
|
+
// input for next layer
|
10273
|
+
inpL = cur;
|
10274
|
+
}
|
10275
|
+
|
10276
|
+
cur = inpL;
|
10277
|
+
|
10278
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10279
|
+
NULL, NULL,
|
10280
|
+
LLM_NORM, cb, -1);
|
10281
|
+
cb(cur, "result_norm", -1);
|
10282
|
+
|
10283
|
+
// lm_head
|
10284
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10285
|
+
cb(cur, "result_output", -1);
|
10286
|
+
|
10287
|
+
ggml_build_forward_expand(gf, cur);
|
10288
|
+
|
10289
|
+
return gf;
|
10290
|
+
}
|
9591
10291
|
};
|
9592
10292
|
|
9593
10293
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -9737,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9737
10437
|
{
|
9738
10438
|
result = llm.build_qwen2();
|
9739
10439
|
} break;
|
10440
|
+
case LLM_ARCH_QWEN2MOE:
|
10441
|
+
{
|
10442
|
+
result = llm.build_qwen2moe();
|
10443
|
+
} break;
|
9740
10444
|
case LLM_ARCH_PHI2:
|
9741
10445
|
{
|
9742
10446
|
result = llm.build_phi2();
|
@@ -9785,6 +10489,14 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9785
10489
|
{
|
9786
10490
|
result = llm.build_command_r();
|
9787
10491
|
} break;
|
10492
|
+
case LLM_ARCH_DBRX:
|
10493
|
+
{
|
10494
|
+
result = llm.build_dbrx();
|
10495
|
+
} break;
|
10496
|
+
case LLM_ARCH_OLMO:
|
10497
|
+
{
|
10498
|
+
result = llm.build_olmo();
|
10499
|
+
} break;
|
9788
10500
|
default:
|
9789
10501
|
GGML_ASSERT(false);
|
9790
10502
|
}
|
@@ -12915,6 +13627,11 @@ struct llama_beam_search_data {
|
|
12915
13627
|
}
|
12916
13628
|
llama_logit_info logit_info(ctx);
|
12917
13629
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
13630
|
+
|
13631
|
+
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
13632
|
+
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
13633
|
+
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
13634
|
+
|
12918
13635
|
size_t i=0;
|
12919
13636
|
if (next_beams.size() < n_beams) {
|
12920
13637
|
for (; next_beams.size() < n_beams ; ++i) {
|
@@ -13535,6 +14252,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13535
14252
|
gguf_set_kv (ctx_out, ml.meta);
|
13536
14253
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
13537
14254
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
14255
|
+
// Remove split metadata
|
14256
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
|
14257
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
|
14258
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
13538
14259
|
|
13539
14260
|
if (params->kv_overrides) {
|
13540
14261
|
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
@@ -14629,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
14629
15350
|
case LLM_ARCH_MINICPM:
|
14630
15351
|
case LLM_ARCH_XVERSE:
|
14631
15352
|
case LLM_ARCH_COMMAND_R:
|
15353
|
+
case LLM_ARCH_OLMO:
|
14632
15354
|
return LLAMA_ROPE_TYPE_NORM;
|
14633
15355
|
|
14634
15356
|
// the pairs of head values are offset by n_rot/2
|
14635
15357
|
case LLM_ARCH_FALCON:
|
14636
15358
|
case LLM_ARCH_GROK:
|
15359
|
+
case LLM_ARCH_DBRX:
|
14637
15360
|
case LLM_ARCH_PERSIMMON:
|
14638
15361
|
case LLM_ARCH_BERT:
|
14639
15362
|
case LLM_ARCH_NOMIC_BERT:
|
14640
15363
|
case LLM_ARCH_STABLELM:
|
14641
15364
|
case LLM_ARCH_QWEN:
|
14642
15365
|
case LLM_ARCH_QWEN2:
|
15366
|
+
case LLM_ARCH_QWEN2MOE:
|
14643
15367
|
case LLM_ARCH_PHI2:
|
14644
15368
|
case LLM_ARCH_GEMMA:
|
14645
15369
|
case LLM_ARCH_STARCODER2:
|
@@ -15320,6 +16044,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
15320
16044
|
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15321
16045
|
ctx->output_ids[id] = i;
|
15322
16046
|
}
|
16047
|
+
|
16048
|
+
ctx->n_outputs = n_outputs;
|
15323
16049
|
}
|
15324
16050
|
}
|
15325
16051
|
|
@@ -16472,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
|
|
16472
17198
|
if (add_ass) {
|
16473
17199
|
ss << "### Response:\n";
|
16474
17200
|
}
|
17201
|
+
} else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
|
17202
|
+
// CohereForAI/c4ai-command-r-plus
|
17203
|
+
for (auto message : chat) {
|
17204
|
+
std::string role(message->role);
|
17205
|
+
if (role == "system") {
|
17206
|
+
ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17207
|
+
} else if (role == "user") {
|
17208
|
+
ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17209
|
+
} else if (role == "assistant") {
|
17210
|
+
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17211
|
+
}
|
17212
|
+
}
|
17213
|
+
if (add_ass) {
|
17214
|
+
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
|
17215
|
+
}
|
16475
17216
|
} else {
|
16476
17217
|
// template not supported
|
16477
17218
|
return -1;
|
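For illustration, a system + user chat rendered through this new Command R branch with `add_ass` enabled would produce a single line of the following shape (wrapped here for readability):

<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant.<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello!<|END_OF_TURN_TOKEN|>
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>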