llama_cpp 0.14.5 → 0.14.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +24 -7
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +263 -5
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -294
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +151 -99
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +1308 -254
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +999 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
@@ -105,7 +105,7 @@
|
|
105
105
|
#endif
|
106
106
|
|
107
107
|
#define LLAMA_MAX_NODES 8192
|
108
|
-
#define LLAMA_MAX_EXPERTS
|
108
|
+
#define LLAMA_MAX_EXPERTS 60
|
109
109
|
|
110
110
|
|
111
111
|
//
|
@@ -209,7 +209,9 @@ enum llm_arch {
|
|
209
209
|
LLM_ARCH_STABLELM,
|
210
210
|
LLM_ARCH_QWEN,
|
211
211
|
LLM_ARCH_QWEN2,
|
212
|
+
LLM_ARCH_QWEN2MOE,
|
212
213
|
LLM_ARCH_PHI2,
|
214
|
+
LLM_ARCH_PHI3,
|
213
215
|
LLM_ARCH_PLAMO,
|
214
216
|
LLM_ARCH_CODESHELL,
|
215
217
|
LLM_ARCH_ORION,
|
@@ -220,6 +222,8 @@ enum llm_arch {
|
|
220
222
|
LLM_ARCH_MAMBA,
|
221
223
|
LLM_ARCH_XVERSE,
|
222
224
|
LLM_ARCH_COMMAND_R,
|
225
|
+
LLM_ARCH_DBRX,
|
226
|
+
LLM_ARCH_OLMO,
|
223
227
|
LLM_ARCH_UNKNOWN,
|
224
228
|
};
|
225
229
|
|
@@ -241,7 +245,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
241
245
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
242
246
|
{ LLM_ARCH_QWEN, "qwen" },
|
243
247
|
{ LLM_ARCH_QWEN2, "qwen2" },
|
248
|
+
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
244
249
|
{ LLM_ARCH_PHI2, "phi2" },
|
250
|
+
{ LLM_ARCH_PHI3, "phi3" },
|
245
251
|
{ LLM_ARCH_PLAMO, "plamo" },
|
246
252
|
{ LLM_ARCH_CODESHELL, "codeshell" },
|
247
253
|
{ LLM_ARCH_ORION, "orion" },
|
@@ -252,6 +258,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
252
258
|
{ LLM_ARCH_MAMBA, "mamba" },
|
253
259
|
{ LLM_ARCH_XVERSE, "xverse" },
|
254
260
|
{ LLM_ARCH_COMMAND_R, "command-r" },
|
261
|
+
{ LLM_ARCH_DBRX, "dbrx" },
|
262
|
+
{ LLM_ARCH_OLMO, "olmo" },
|
255
263
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
256
264
|
};
|
257
265
|
|
@@ -325,6 +333,10 @@ enum llm_kv {
|
|
325
333
|
LLM_KV_TOKENIZER_ADD_PREFIX,
|
326
334
|
LLM_KV_TOKENIZER_HF_JSON,
|
327
335
|
LLM_KV_TOKENIZER_RWKV,
|
336
|
+
LLM_KV_TOKENIZER_PREFIX_ID,
|
337
|
+
LLM_KV_TOKENIZER_SUFFIX_ID,
|
338
|
+
LLM_KV_TOKENIZER_MIDDLE_ID,
|
339
|
+
LLM_KV_TOKENIZER_EOT_ID,
|
328
340
|
};
|
329
341
|
|
330
342
|
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
@@ -397,6 +409,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
397
409
|
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
398
410
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
399
411
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
412
|
+
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
413
|
+
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
414
|
+
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
415
|
+
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
400
416
|
};
|
401
417
|
|
402
418
|
struct LLM_KV {
|
@@ -427,6 +443,7 @@ enum llm_tensor {
|
|
427
443
|
LLM_TENSOR_ATTN_OUT_NORM,
|
428
444
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
429
445
|
LLM_TENSOR_FFN_GATE_INP,
|
446
|
+
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
430
447
|
LLM_TENSOR_FFN_NORM,
|
431
448
|
LLM_TENSOR_FFN_GATE,
|
432
449
|
LLM_TENSOR_FFN_DOWN,
|
@@ -438,6 +455,9 @@ enum llm_tensor {
|
|
438
455
|
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
439
456
|
LLM_TENSOR_FFN_GATE_EXPS,
|
440
457
|
LLM_TENSOR_FFN_UP_EXPS,
|
458
|
+
LLM_TENSOR_FFN_DOWN_SHEXP,
|
459
|
+
LLM_TENSOR_FFN_GATE_SHEXP,
|
460
|
+
LLM_TENSOR_FFN_UP_SHEXP,
|
441
461
|
LLM_TENSOR_ATTN_Q_NORM,
|
442
462
|
LLM_TENSOR_ATTN_K_NORM,
|
443
463
|
LLM_TENSOR_LAYER_OUT_NORM,
|
@@ -700,6 +720,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
700
720
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
701
721
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
702
722
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
723
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
724
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
703
725
|
},
|
704
726
|
},
|
705
727
|
{
|
@@ -735,6 +757,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
735
757
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
736
758
|
},
|
737
759
|
},
|
760
|
+
{
|
761
|
+
LLM_ARCH_QWEN2MOE,
|
762
|
+
{
|
763
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
764
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
765
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
766
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
767
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
768
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
769
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
770
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
771
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
772
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
773
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
774
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
775
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
776
|
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
777
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
778
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
779
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
780
|
+
},
|
781
|
+
},
|
738
782
|
{
|
739
783
|
LLM_ARCH_PHI2,
|
740
784
|
{
|
@@ -751,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
751
795
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
752
796
|
},
|
753
797
|
},
|
798
|
+
{
|
799
|
+
LLM_ARCH_PHI3,
|
800
|
+
{
|
801
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
802
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
803
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
804
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
805
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
806
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
807
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
808
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
809
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
810
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
811
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
812
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
813
|
+
},
|
814
|
+
},
|
754
815
|
{
|
755
816
|
LLM_ARCH_PLAMO,
|
756
817
|
{
|
@@ -934,6 +995,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
934
995
|
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
935
996
|
},
|
936
997
|
},
|
998
|
+
{
|
999
|
+
LLM_ARCH_DBRX,
|
1000
|
+
{
|
1001
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
1002
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
1003
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1004
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
1005
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
1006
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1007
|
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
1008
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
1009
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
1010
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
1011
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
1012
|
+
},
|
1013
|
+
},
|
1014
|
+
{
|
1015
|
+
LLM_ARCH_OLMO,
|
1016
|
+
{
|
1017
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
1018
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1019
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
1020
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
1021
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
1022
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1023
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
1024
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
1025
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
1026
|
+
},
|
1027
|
+
},
|
937
1028
|
{
|
938
1029
|
LLM_ARCH_UNKNOWN,
|
939
1030
|
{
|
@@ -1528,12 +1619,12 @@ struct llama_mlock {
|
|
1528
1619
|
};
|
1529
1620
|
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
1530
1621
|
|
1531
|
-
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
1622
|
+
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
1532
1623
|
std::vector<char> result(8, 0);
|
1533
|
-
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
|
1624
|
+
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
1534
1625
|
if (n_tokens < 0) {
|
1535
1626
|
result.resize(-n_tokens);
|
1536
|
-
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
|
1627
|
+
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
1537
1628
|
GGML_ASSERT(check == -n_tokens);
|
1538
1629
|
}
|
1539
1630
|
else {
|
@@ -1690,6 +1781,7 @@ enum e_model {
|
|
1690
1781
|
MODEL_4B,
|
1691
1782
|
MODEL_7B,
|
1692
1783
|
MODEL_8B,
|
1784
|
+
MODEL_12B,
|
1693
1785
|
MODEL_13B,
|
1694
1786
|
MODEL_14B,
|
1695
1787
|
MODEL_15B,
|
@@ -1705,8 +1797,10 @@ enum e_model {
|
|
1705
1797
|
MODEL_MEDIUM,
|
1706
1798
|
MODEL_LARGE,
|
1707
1799
|
MODEL_XL,
|
1800
|
+
MODEL_A2_7B,
|
1708
1801
|
MODEL_8x7B,
|
1709
1802
|
MODEL_8x22B,
|
1803
|
+
MODEL_16x12B,
|
1710
1804
|
};
|
1711
1805
|
|
1712
1806
|
static const size_t kiB = 1024;
|
@@ -1890,6 +1984,12 @@ struct llama_layer {
|
|
1890
1984
|
struct ggml_tensor * ffn_down_exps;
|
1891
1985
|
struct ggml_tensor * ffn_up_exps ;
|
1892
1986
|
|
1987
|
+
// ff shared expert (shexp)
|
1988
|
+
struct ggml_tensor * ffn_gate_inp_shexp;
|
1989
|
+
struct ggml_tensor * ffn_gate_shexp;
|
1990
|
+
struct ggml_tensor * ffn_down_shexp;
|
1991
|
+
struct ggml_tensor * ffn_up_shexp;
|
1992
|
+
|
1893
1993
|
// ff bias
|
1894
1994
|
struct ggml_tensor * ffn_down_b; // b2
|
1895
1995
|
struct ggml_tensor * ffn_up_b; // b3
|
@@ -2036,10 +2136,10 @@ struct llama_vocab {
|
|
2036
2136
|
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
|
2037
2137
|
|
2038
2138
|
id linefeed_id = 13;
|
2039
|
-
id special_prefix_id =
|
2040
|
-
id
|
2041
|
-
id
|
2042
|
-
id special_eot_id =
|
2139
|
+
id special_prefix_id = -1;
|
2140
|
+
id special_suffix_id = -1;
|
2141
|
+
id special_middle_id = -1;
|
2142
|
+
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
|
2043
2143
|
|
2044
2144
|
bool add_space_prefix = true;
|
2045
2145
|
|
@@ -2899,9 +2999,13 @@ struct llama_model_loader {
|
|
2899
2999
|
|
2900
3000
|
ggml_tensor * tensor;
|
2901
3001
|
|
2902
|
-
llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
3002
|
+
llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
2903
3003
|
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
|
2904
3004
|
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
3005
|
+
|
3006
|
+
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
|
3007
|
+
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
|
3008
|
+
}
|
2905
3009
|
}
|
2906
3010
|
};
|
2907
3011
|
std::vector<llama_tensor_weight> weights;
|
@@ -2940,15 +3044,15 @@ struct llama_model_loader {
|
|
2940
3044
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
2941
3045
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
2942
3046
|
|
3047
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
3048
|
+
contexts.emplace_back(ctx);
|
3049
|
+
|
2943
3050
|
// Save tensors data offset of the main file.
|
2944
3051
|
// For subsidiary files, `meta` tensor data offset must not be used,
|
2945
3052
|
// so we build a unified tensors index for weights.
|
2946
3053
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
2947
|
-
weights.emplace_back(0, cur->name, meta, cur);
|
3054
|
+
weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
|
2948
3055
|
}
|
2949
|
-
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
2950
|
-
contexts.emplace_back(ctx);
|
2951
|
-
|
2952
3056
|
uint16_t n_split = 0;
|
2953
3057
|
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
2954
3058
|
|
@@ -2982,12 +3086,13 @@ struct llama_model_loader {
|
|
2982
3086
|
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
|
2983
3087
|
}
|
2984
3088
|
|
3089
|
+
files.emplace_back(new llama_file(split_path, "rb"));
|
3090
|
+
contexts.emplace_back(ctx);
|
3091
|
+
|
2985
3092
|
// Save tensors data offset info of the shard.
|
2986
3093
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
2987
|
-
weights.emplace_back(idx, cur->name, ctx_gguf, cur);
|
3094
|
+
weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
|
2988
3095
|
}
|
2989
|
-
files.emplace_back(new llama_file(split_path, "rb"));
|
2990
|
-
contexts.emplace_back(ctx);
|
2991
3096
|
|
2992
3097
|
gguf_free(ctx_gguf);
|
2993
3098
|
}
|
@@ -3197,6 +3302,10 @@ struct llama_model_loader {
|
|
3197
3302
|
return nullptr;
|
3198
3303
|
}
|
3199
3304
|
|
3305
|
+
const llama_tensor_weight * get_weight(int i) const {
|
3306
|
+
return get_weight(get_tensor_name(i));
|
3307
|
+
}
|
3308
|
+
|
3200
3309
|
const llama_tensor_weight & require_weight(const char * name) const {
|
3201
3310
|
const llama_tensor_weight * weight = get_weight(name);
|
3202
3311
|
if (!weight) {
|
@@ -3545,6 +3654,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
3545
3654
|
case MODEL_3B: return "3B";
|
3546
3655
|
case MODEL_7B: return "7B";
|
3547
3656
|
case MODEL_8B: return "8B";
|
3657
|
+
case MODEL_12B: return "12B";
|
3548
3658
|
case MODEL_13B: return "13B";
|
3549
3659
|
case MODEL_14B: return "14B";
|
3550
3660
|
case MODEL_15B: return "15B";
|
@@ -3560,8 +3670,10 @@ static const char * llama_model_type_name(e_model type) {
|
|
3560
3670
|
case MODEL_MEDIUM: return "0.4B";
|
3561
3671
|
case MODEL_LARGE: return "0.8B";
|
3562
3672
|
case MODEL_XL: return "1.5B";
|
3673
|
+
case MODEL_A2_7B: return "A2.7B";
|
3563
3674
|
case MODEL_8x7B: return "8x7B";
|
3564
3675
|
case MODEL_8x22B: return "8x22B";
|
3676
|
+
case MODEL_16x12B: return "16x12B";
|
3565
3677
|
default: return "?B";
|
3566
3678
|
}
|
3567
3679
|
}
|
@@ -3686,7 +3798,7 @@ static void llm_load_hparams(
|
|
3686
3798
|
switch (hparams.n_layer) {
|
3687
3799
|
case 22: model.type = e_model::MODEL_1B; break;
|
3688
3800
|
case 26: model.type = e_model::MODEL_3B; break;
|
3689
|
-
case 32: model.type = e_model::MODEL_7B; break;
|
3801
|
+
case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
|
3690
3802
|
case 40: model.type = e_model::MODEL_13B; break;
|
3691
3803
|
case 48: model.type = e_model::MODEL_34B; break;
|
3692
3804
|
case 60: model.type = e_model::MODEL_30B; break;
|
@@ -3834,6 +3946,7 @@ static void llm_load_hparams(
|
|
3834
3946
|
switch (hparams.n_layer) {
|
3835
3947
|
case 24: model.type = e_model::MODEL_1B; break;
|
3836
3948
|
case 32: model.type = e_model::MODEL_3B; break;
|
3949
|
+
case 40: model.type = e_model::MODEL_12B; break;
|
3837
3950
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3838
3951
|
}
|
3839
3952
|
} break;
|
@@ -3858,10 +3971,28 @@ static void llm_load_hparams(
|
|
3858
3971
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3859
3972
|
}
|
3860
3973
|
} break;
|
3974
|
+
case LLM_ARCH_QWEN2MOE:
|
3975
|
+
{
|
3976
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3977
|
+
switch (hparams.n_layer) {
|
3978
|
+
case 24: model.type = e_model::MODEL_A2_7B; break;
|
3979
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3980
|
+
}
|
3981
|
+
} break;
|
3861
3982
|
case LLM_ARCH_PHI2:
|
3862
3983
|
{
|
3863
3984
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
3864
3985
|
|
3986
|
+
switch (hparams.n_layer) {
|
3987
|
+
case 24: model.type = e_model::MODEL_1B; break;
|
3988
|
+
case 32: model.type = e_model::MODEL_3B; break;
|
3989
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3990
|
+
}
|
3991
|
+
} break;
|
3992
|
+
case LLM_ARCH_PHI3:
|
3993
|
+
{
|
3994
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3995
|
+
|
3865
3996
|
switch (hparams.n_layer) {
|
3866
3997
|
case 24: model.type = e_model::MODEL_1B; break;
|
3867
3998
|
case 32: model.type = e_model::MODEL_3B; break;
|
@@ -3983,6 +4114,28 @@ static void llm_load_hparams(
|
|
3983
4114
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3984
4115
|
}
|
3985
4116
|
} break;
|
4117
|
+
case LLM_ARCH_DBRX:
|
4118
|
+
{
|
4119
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
4120
|
+
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
4121
|
+
|
4122
|
+
switch (hparams.n_layer) {
|
4123
|
+
case 40: model.type = e_model::MODEL_16x12B; break;
|
4124
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4125
|
+
}
|
4126
|
+
} break;
|
4127
|
+
case LLM_ARCH_OLMO:
|
4128
|
+
{
|
4129
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
4130
|
+
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
4131
|
+
|
4132
|
+
switch (hparams.n_layer) {
|
4133
|
+
case 22: model.type = e_model::MODEL_1B; break;
|
4134
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
4135
|
+
case 80: model.type = e_model::MODEL_70B; break;
|
4136
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4137
|
+
}
|
4138
|
+
} break;
|
3986
4139
|
default: (void)0;
|
3987
4140
|
}
|
3988
4141
|
|
@@ -4042,6 +4195,35 @@ static void llm_load_vocab(
|
|
4042
4195
|
vocab.special_cls_id = -1;
|
4043
4196
|
vocab.special_mask_id = -1;
|
4044
4197
|
|
4198
|
+
// For Fill-In-the-Middle (FIM)/infill models which were converted
|
4199
|
+
// prior to support of FIM special tokens in GGUF, the following
|
4200
|
+
// will allow those models to continue to work. The general names
|
4201
|
+
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
|
4202
|
+
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
4203
|
+
// new versions of these models have been published.
|
4204
|
+
std::string gen_name;
|
4205
|
+
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
4206
|
+
|
4207
|
+
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
4208
|
+
[](unsigned char c){ return std::tolower(c); });
|
4209
|
+
|
4210
|
+
if (gen_name.find("code") != std::string::npos) {
|
4211
|
+
if (model.arch == LLM_ARCH_LLAMA) {
|
4212
|
+
vocab.special_prefix_id = 32007;
|
4213
|
+
vocab.special_suffix_id = 32008;
|
4214
|
+
vocab.special_middle_id = 32009;
|
4215
|
+
vocab.special_eot_id = 32010;
|
4216
|
+
} else if (model.arch == LLM_ARCH_GEMMA) {
|
4217
|
+
vocab.special_prefix_id = 67;
|
4218
|
+
vocab.special_suffix_id = 69;
|
4219
|
+
vocab.special_middle_id = 68;
|
4220
|
+
// TODO: this is not EOT, it is "file separator" token, needs fix
|
4221
|
+
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
|
4222
|
+
//vocab.special_eot_id = 70;
|
4223
|
+
vocab.special_eot_id = 107;
|
4224
|
+
}
|
4225
|
+
}
|
4226
|
+
|
4045
4227
|
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
4046
4228
|
if (add_space_prefix_keyidx != -1) {
|
4047
4229
|
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
@@ -4155,14 +4337,19 @@ static void llm_load_vocab(
|
|
4155
4337
|
// special tokens
|
4156
4338
|
{
|
4157
4339
|
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
|
4158
|
-
{ LLM_KV_TOKENIZER_BOS_ID,
|
4159
|
-
{ LLM_KV_TOKENIZER_EOS_ID,
|
4160
|
-
{ LLM_KV_TOKENIZER_UNK_ID,
|
4161
|
-
{ LLM_KV_TOKENIZER_SEP_ID,
|
4162
|
-
{ LLM_KV_TOKENIZER_PAD_ID,
|
4163
|
-
{ LLM_KV_TOKENIZER_CLS_ID,
|
4164
|
-
{ LLM_KV_TOKENIZER_MASK_ID,
|
4340
|
+
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
4341
|
+
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
4342
|
+
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
4343
|
+
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
4344
|
+
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
4345
|
+
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
|
4346
|
+
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
|
4347
|
+
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
|
4348
|
+
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
|
4349
|
+
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
|
4350
|
+
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
|
4165
4351
|
};
|
4352
|
+
|
4166
4353
|
for (const auto & it : special_token_types) {
|
4167
4354
|
const std::string & key = kv(std::get<0>(it));
|
4168
4355
|
int32_t & id = std::get<1>(it);
|
@@ -4177,7 +4364,6 @@ static void llm_load_vocab(
|
|
4177
4364
|
} else {
|
4178
4365
|
id = new_id;
|
4179
4366
|
}
|
4180
|
-
|
4181
4367
|
}
|
4182
4368
|
|
4183
4369
|
// Handle add_bos_token and add_eos_token
|
@@ -4191,6 +4377,28 @@ static void llm_load_vocab(
|
|
4191
4377
|
vocab.special_add_eos = int(temp);
|
4192
4378
|
}
|
4193
4379
|
}
|
4380
|
+
|
4381
|
+
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
|
4382
|
+
//
|
4383
|
+
// TODO: convert scripts should provide this token through the KV metadata LLM_KV_TOKENIZER_EOT_ID
|
4384
|
+
// for now, we apply this workaround to find the EOT token based on its text
|
4385
|
+
if (vocab.special_eot_id == -1) {
|
4386
|
+
for (const auto & t : vocab.token_to_id) {
|
4387
|
+
if (
|
4388
|
+
// TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
|
4389
|
+
// need to fix convert script
|
4390
|
+
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|
4391
|
+
(t.first == "<|eot_id|>" ||
|
4392
|
+
t.first == "<|im_end|>" ||
|
4393
|
+
t.first == "<|end|>" ||
|
4394
|
+
t.first == "<end_of_turn>"
|
4395
|
+
)
|
4396
|
+
) {
|
4397
|
+
vocab.special_eot_id = t.second;
|
4398
|
+
break;
|
4399
|
+
}
|
4400
|
+
}
|
4401
|
+
}
|
4194
4402
|
}
|
4195
4403
|
|
4196
4404
|
// build special tokens cache
|
@@ -4353,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
4353
4561
|
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
4354
4562
|
|
4355
4563
|
// special tokens
|
4356
|
-
if (vocab.special_bos_id
|
4357
|
-
if (vocab.special_eos_id
|
4358
|
-
if (vocab.special_unk_id
|
4359
|
-
if (vocab.special_sep_id
|
4360
|
-
if (vocab.special_pad_id
|
4361
|
-
if (vocab.special_cls_id
|
4362
|
-
if (vocab.special_mask_id
|
4363
|
-
|
4564
|
+
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
4565
|
+
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
4566
|
+
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
4567
|
+
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
4568
|
+
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
4569
|
+
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
|
4570
|
+
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
4571
|
+
|
4572
|
+
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
4573
|
+
if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
|
4574
|
+
if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
|
4575
|
+
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
|
4576
|
+
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
4364
4577
|
}
|
4365
4578
|
|
4366
4579
|
// Returns false if cancelled by progress_callback
|
@@ -4378,6 +4591,13 @@ static bool llm_load_tensors(
|
|
4378
4591
|
|
4379
4592
|
auto & hparams = model.hparams;
|
4380
4593
|
|
4594
|
+
#ifdef GGML_USE_SYCL
|
4595
|
+
// disable MoE with SYCL until mul_mat_id is updated
|
4596
|
+
if (hparams.n_expert > 0) {
|
4597
|
+
n_gpu_layers = 0;
|
4598
|
+
}
|
4599
|
+
#endif
|
4600
|
+
|
4381
4601
|
model.split_mode = split_mode;
|
4382
4602
|
model.main_gpu = main_gpu;
|
4383
4603
|
model.n_gpu_layers = n_gpu_layers;
|
@@ -4475,7 +4695,7 @@ static bool llm_load_tensors(
|
|
4475
4695
|
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
4476
4696
|
|
4477
4697
|
// for moe merged tensors
|
4478
|
-
ctx_size += ggml_tensor_overhead()*
|
4698
|
+
ctx_size += ggml_tensor_overhead()*n_layer*3;
|
4479
4699
|
|
4480
4700
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
4481
4701
|
for (auto & it : buft_layer_count) {
|
@@ -4671,6 +4891,39 @@ static bool llm_load_tensors(
|
|
4671
4891
|
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
4672
4892
|
}
|
4673
4893
|
} break;
|
4894
|
+
case LLM_ARCH_DBRX:
|
4895
|
+
{
|
4896
|
+
if (n_expert == 0) {
|
4897
|
+
throw std::runtime_error("DBRX model cannot have zero experts");
|
4898
|
+
}
|
4899
|
+
|
4900
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4901
|
+
|
4902
|
+
// output
|
4903
|
+
{
|
4904
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4905
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
4906
|
+
}
|
4907
|
+
|
4908
|
+
for (int i = 0; i < n_layer; ++i) {
|
4909
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4910
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4911
|
+
|
4912
|
+
auto & layer = model.layers[i];
|
4913
|
+
|
4914
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4915
|
+
|
4916
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4917
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4918
|
+
|
4919
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
4920
|
+
|
4921
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4922
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4923
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
|
4924
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4925
|
+
}
|
4926
|
+
} break;
|
4674
4927
|
case LLM_ARCH_BAICHUAN:
|
4675
4928
|
{
|
4676
4929
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -4985,8 +5238,13 @@ static bool llm_load_tensors(
|
|
4985
5238
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
|
4986
5239
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
|
4987
5240
|
|
4988
|
-
|
4989
|
-
layer.
|
5241
|
+
// optional q and k layernorms, present in StableLM 2 12B
|
5242
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
|
5243
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
|
5244
|
+
|
5245
|
+
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
5246
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
|
5247
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
4990
5248
|
|
4991
5249
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4992
5250
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
@@ -5029,7 +5287,13 @@ static bool llm_load_tensors(
|
|
5029
5287
|
// output
|
5030
5288
|
{
|
5031
5289
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5032
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5290
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
5291
|
+
// if output is NULL, init from the input tok embed
|
5292
|
+
if (model.output == NULL) {
|
5293
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5294
|
+
ml.n_created--; // artificial tensor
|
5295
|
+
ml.size_data += ggml_nbytes(model.output);
|
5296
|
+
}
|
5033
5297
|
}
|
5034
5298
|
|
5035
5299
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -5057,6 +5321,54 @@ static bool llm_load_tensors(
|
|
5057
5321
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5058
5322
|
}
|
5059
5323
|
} break;
|
5324
|
+
case LLM_ARCH_QWEN2MOE:
|
5325
|
+
{
|
5326
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5327
|
+
|
5328
|
+
// output
|
5329
|
+
{
|
5330
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5331
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5332
|
+
}
|
5333
|
+
|
5334
|
+
for (int i = 0; i < n_layer; ++i) {
|
5335
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5336
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5337
|
+
|
5338
|
+
auto & layer = model.layers[i];
|
5339
|
+
|
5340
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5341
|
+
|
5342
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5343
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5344
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5345
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5346
|
+
|
5347
|
+
// optional bias tensors
|
5348
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
5349
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
5350
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
5351
|
+
|
5352
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5353
|
+
|
5354
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
5355
|
+
|
5356
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
5357
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
5358
|
+
|
5359
|
+
// MoE branch
|
5360
|
+
auto n_ff_exp = n_ff / hparams.n_expert_used;
|
5361
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
5362
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
5363
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
5364
|
+
|
5365
|
+
// Shared expert branch
|
5366
|
+
layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
|
5367
|
+
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
|
5368
|
+
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
|
5369
|
+
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
|
5370
|
+
}
|
5371
|
+
} break;
|
5060
5372
|
case LLM_ARCH_PHI2:
|
5061
5373
|
{
|
5062
5374
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -5102,6 +5414,33 @@ static bool llm_load_tensors(
|
|
5102
5414
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
5103
5415
|
}
|
5104
5416
|
} break;
|
5417
|
+
case LLM_ARCH_PHI3:
|
5418
|
+
{
|
5419
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
|
5420
|
+
|
5421
|
+
// output
|
5422
|
+
{
|
5423
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
|
5424
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
|
5425
|
+
}
|
5426
|
+
|
5427
|
+
for (int i = 0; i < n_layer; ++i) {
|
5428
|
+
ggml_context* ctx_layer = ctx_for_layer(i);
|
5429
|
+
ggml_context* ctx_split = ctx_for_layer_split(i);
|
5430
|
+
|
5431
|
+
auto& layer = model.layers[i];
|
5432
|
+
|
5433
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
5434
|
+
|
5435
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
|
5436
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
5437
|
+
|
5438
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
5439
|
+
|
5440
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
5441
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
5442
|
+
}
|
5443
|
+
} break;
|
5105
5444
|
case LLM_ARCH_PLAMO:
|
5106
5445
|
{
|
5107
5446
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -5450,6 +5789,37 @@ static bool llm_load_tensors(
|
|
5450
5789
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5451
5790
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5452
5791
|
|
5792
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5793
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5794
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5795
|
+
}
|
5796
|
+
} break;
|
5797
|
+
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
|
5798
|
+
{
|
5799
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5800
|
+
|
5801
|
+
// output
|
5802
|
+
{
|
5803
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
5804
|
+
// if output is NULL, init from the input tok embed
|
5805
|
+
if (model.output == NULL) {
|
5806
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5807
|
+
ml.n_created--; // artificial tensor
|
5808
|
+
ml.size_data += ggml_nbytes(model.output);
|
5809
|
+
}
|
5810
|
+
}
|
5811
|
+
|
5812
|
+
for (int i = 0; i < n_layer; ++i) {
|
5813
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5814
|
+
|
5815
|
+
auto & layer = model.layers[i];
|
5816
|
+
|
5817
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5818
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5819
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5820
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5821
|
+
|
5822
|
+
|
5453
5823
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5454
5824
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5455
5825
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
@@ -5890,6 +6260,100 @@ static struct ggml_tensor * llm_build_ffn(
|
|
5890
6260
|
return cur;
|
5891
6261
|
}
|
5892
6262
|
|
6263
|
+
static struct ggml_tensor * llm_build_moe_ffn(
|
6264
|
+
struct ggml_context * ctx,
|
6265
|
+
struct ggml_tensor * cur,
|
6266
|
+
struct ggml_tensor * gate_inp,
|
6267
|
+
struct ggml_tensor * up_exps,
|
6268
|
+
struct ggml_tensor * gate_exps,
|
6269
|
+
struct ggml_tensor * down_exps,
|
6270
|
+
int64_t n_expert,
|
6271
|
+
int64_t n_expert_used,
|
6272
|
+
llm_ffn_op_type type_op,
|
6273
|
+
bool norm_w,
|
6274
|
+
const llm_build_cb & cb,
|
6275
|
+
int il) {
|
6276
|
+
int64_t n_embd = cur->ne[0];
|
6277
|
+
int64_t n_tokens = cur->ne[1];
|
6278
|
+
|
6279
|
+
ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
|
6280
|
+
cb(logits, "ffn_moe_logits", il);
|
6281
|
+
|
6282
|
+
ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
|
6283
|
+
cb(probs, "ffn_moe_probs", il);
|
6284
|
+
|
6285
|
+
// select experts
|
6286
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
|
6287
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6288
|
+
cb(selected_experts, "ffn_moe_topk", il);
|
6289
|
+
|
6290
|
+
ggml_tensor * weights = ggml_get_rows(ctx,
|
6291
|
+
ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
6292
|
+
cb(weights, "ffn_moe_weights", il);
|
6293
|
+
|
6294
|
+
if (norm_w) {
|
6295
|
+
weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
|
6296
|
+
|
6297
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
|
6298
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6299
|
+
|
6300
|
+
weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
|
6301
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6302
|
+
|
6303
|
+
weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
|
6304
|
+
}
|
6305
|
+
|
6306
|
+
cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
6307
|
+
ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
6308
|
+
cb(up, "ffn_moe_up", il);
|
6309
|
+
|
6310
|
+
ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
6311
|
+
cb(gate, "ffn_moe_gate", il);
|
6312
|
+
|
6313
|
+
switch (type_op) {
|
6314
|
+
case LLM_FFN_SILU:
|
6315
|
+
{
|
6316
|
+
gate = ggml_silu(ctx, gate);
|
6317
|
+
cb(gate, "ffn_moe_silu", il);
|
6318
|
+
} break;
|
6319
|
+
case LLM_FFN_GELU:
|
6320
|
+
{
|
6321
|
+
gate = ggml_gelu(ctx, gate);
|
6322
|
+
cb(gate, "ffn_moe_gelu", il);
|
6323
|
+
} break;
|
6324
|
+
default:
|
6325
|
+
GGML_ASSERT(false);
|
6326
|
+
}
|
6327
|
+
|
6328
|
+
ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
|
6329
|
+
cb(par, "ffn_moe_gate_par", il);
|
6330
|
+
|
6331
|
+
ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
6332
|
+
cb(experts, "ffn_moe_down", il);
|
6333
|
+
|
6334
|
+
experts = ggml_mul(ctx, experts, weights);
|
6335
|
+
|
6336
|
+
// aggregate experts
|
6337
|
+
ggml_tensor * moe_out = nullptr;
|
6338
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6339
|
+
ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
|
6340
|
+
experts->nb[2], i*experts->nb[1]);
|
6341
|
+
|
6342
|
+
if (i == 0) {
|
6343
|
+
moe_out = cur_expert;
|
6344
|
+
} else {
|
6345
|
+
moe_out = ggml_add(ctx, moe_out, cur_expert);
|
6346
|
+
}
|
6347
|
+
}
|
6348
|
+
|
6349
|
+
if (n_expert_used == 1) {
|
6350
|
+
// avoid returning a non-contiguous tensor
|
6351
|
+
moe_out = ggml_cont(ctx, moe_out);
|
6352
|
+
}
|
6353
|
+
|
6354
|
+
return moe_out;
|
6355
|
+
}
|
6356
|
+
|
5893
6357
|
// if max_alibi_bias > 0 then apply ALiBi
|
5894
6358
|
static struct ggml_tensor * llm_build_kqv(
|
5895
6359
|
struct ggml_context * ctx,
|
@@ -5928,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
5928
6392
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
5929
6393
|
cb(kq, "kq", il);
|
5930
6394
|
|
5931
|
-
if (model.arch == LLM_ARCH_PHI2) {
|
6395
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
5932
6396
|
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
5933
6397
|
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
5934
6398
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
@@ -6433,63 +6897,16 @@ struct llm_build_context {
|
|
6433
6897
|
LLM_NORM_RMS, cb, il);
|
6434
6898
|
cb(cur, "ffn_norm", il);
|
6435
6899
|
|
6436
|
-
|
6437
|
-
|
6438
|
-
|
6439
|
-
|
6440
|
-
|
6441
|
-
|
6442
|
-
|
6443
|
-
|
6444
|
-
cb(
|
6445
|
-
|
6446
|
-
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6447
|
-
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6448
|
-
cb(weights, "ffn_moe_weights", il);
|
6449
|
-
|
6450
|
-
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6451
|
-
|
6452
|
-
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6453
|
-
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6454
|
-
|
6455
|
-
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6456
|
-
cb(weights, "ffn_moe_weights_norm", il);
|
6457
|
-
|
6458
|
-
// compute expert outputs
|
6459
|
-
ggml_tensor * moe_out = nullptr;
|
6460
|
-
|
6461
|
-
for (int i = 0; i < n_expert_used; ++i) {
|
6462
|
-
ggml_tensor * cur_expert;
|
6463
|
-
|
6464
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6465
|
-
cb(cur_up, "ffn_moe_up", il);
|
6466
|
-
|
6467
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6468
|
-
cb(cur_gate, "ffn_moe_gate", il);
|
6469
|
-
|
6470
|
-
cur_gate = ggml_silu(ctx0, cur_gate);
|
6471
|
-
cb(cur_gate, "ffn_moe_silu", il);
|
6472
|
-
|
6473
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6474
|
-
cb(cur_expert, "ffn_moe_gate_par", il);
|
6475
|
-
|
6476
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6477
|
-
cb(cur_expert, "ffn_moe_down", il);
|
6478
|
-
|
6479
|
-
cur_expert = ggml_mul(ctx0, cur_expert,
|
6480
|
-
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6481
|
-
cb(cur_expert, "ffn_moe_weighted", il);
|
6482
|
-
|
6483
|
-
if (i == 0) {
|
6484
|
-
moe_out = cur_expert;
|
6485
|
-
} else {
|
6486
|
-
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6487
|
-
cb(moe_out, "ffn_moe_out", il);
|
6488
|
-
}
|
6489
|
-
}
|
6490
|
-
|
6491
|
-
cur = moe_out;
|
6492
|
-
}
|
6900
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
6901
|
+
model.layers[il].ffn_gate_inp,
|
6902
|
+
model.layers[il].ffn_up_exps,
|
6903
|
+
model.layers[il].ffn_gate_exps,
|
6904
|
+
model.layers[il].ffn_down_exps,
|
6905
|
+
n_expert, n_expert_used,
|
6906
|
+
LLM_FFN_SILU, true,
|
6907
|
+
cb, il);
|
6908
|
+
cb(cur, "ffn_moe_out", il);
|
6909
|
+
}
|
6493
6910
|
|
6494
6911
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
6495
6912
|
cb(cur, "ffn_out", il);
|
@@ -6967,74 +7384,158 @@ struct llm_build_context {
|
|
6967
7384
|
LLM_NORM_RMS, cb, il);
|
6968
7385
|
cb(cur, "ffn_norm", il);
|
6969
7386
|
|
6970
|
-
|
6971
|
-
|
7387
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
7388
|
+
model.layers[il].ffn_gate_inp,
|
7389
|
+
model.layers[il].ffn_up_exps,
|
7390
|
+
model.layers[il].ffn_gate_exps,
|
7391
|
+
model.layers[il].ffn_down_exps,
|
7392
|
+
n_expert, n_expert_used,
|
7393
|
+
LLM_FFN_GELU, true,
|
7394
|
+
cb, il);
|
7395
|
+
cb(cur, "ffn_moe_out", il);
|
7396
|
+
|
7397
|
+
// Grok
|
7398
|
+
// if layer_out_norm is present then apply it before adding the input
|
7399
|
+
// Idea: maybe ffn_out_norm is a better name
|
7400
|
+
if (model.layers[il].layer_out_norm) {
|
7401
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7402
|
+
model.layers[il].layer_out_norm, NULL,
|
7403
|
+
LLM_NORM_RMS, cb, il);
|
7404
|
+
cb(cur, "layer_out_norm", il);
|
7405
|
+
}
|
7406
|
+
|
7407
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
7408
|
+
cb(cur, "ffn_out", il);
|
7409
|
+
|
7410
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7411
|
+
if (layer_dir != nullptr) {
|
7412
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7413
|
+
}
|
7414
|
+
cb(cur, "l_out", il);
|
7415
|
+
|
7416
|
+
// input for next layer
|
7417
|
+
inpL = cur;
|
7418
|
+
}
|
7419
|
+
|
7420
|
+
cur = inpL;
|
7421
|
+
|
7422
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7423
|
+
model.output_norm, NULL,
|
7424
|
+
LLM_NORM_RMS, cb, -1);
|
7425
|
+
cb(cur, "result_norm", -1);
|
7426
|
+
|
7427
|
+
// lm_head
|
7428
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7429
|
+
|
7430
|
+
// Grok
|
7431
|
+
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
6972
7432
|
|
6973
|
-
|
6974
|
-
cb(probs, "ffn_moe_probs", il);
|
7433
|
+
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
6975
7434
|
|
6976
|
-
|
6977
|
-
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
6978
|
-
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
7435
|
+
cb(cur, "result_output", -1);
|
6979
7436
|
|
6980
|
-
|
6981
|
-
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6982
|
-
cb(weights, "ffn_moe_weights", il);
|
7437
|
+
ggml_build_forward_expand(gf, cur);
|
6983
7438
|
|
6984
|
-
|
7439
|
+
return gf;
|
7440
|
+
}
|
7441
|
+
|
7442
|
+
struct ggml_cgraph * build_dbrx() {
|
7443
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6985
7444
|
|
6986
|
-
|
6987
|
-
|
7445
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
7446
|
+
int32_t n_tokens = this->n_tokens;
|
6988
7447
|
|
6989
|
-
|
6990
|
-
|
7448
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7449
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
7450
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7451
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6991
7452
|
|
6992
|
-
|
6993
|
-
|
7453
|
+
struct ggml_tensor * cur;
|
7454
|
+
struct ggml_tensor * inpL;
|
6994
7455
|
|
6995
|
-
|
6996
|
-
ggml_tensor * cur_expert;
|
7456
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6997
7457
|
|
6998
|
-
|
6999
|
-
|
7458
|
+
// inp_pos - contains the positions
|
7459
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7000
7460
|
|
7001
|
-
|
7002
|
-
|
7461
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7462
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7003
7463
|
|
7004
|
-
|
7005
|
-
|
7006
|
-
cb(cur_gate, "ffn_moe_gelu", il);
|
7464
|
+
for (int il = 0; il < n_layer; ++il) {
|
7465
|
+
struct ggml_tensor * inpSA = inpL;
|
7007
7466
|
|
7008
|
-
|
7009
|
-
|
7467
|
+
// norm
|
7468
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7469
|
+
model.layers[il].attn_norm, NULL,
|
7470
|
+
LLM_NORM, cb, il);
|
7471
|
+
cb(cur, "attn_norm", il);
|
7010
7472
|
|
7011
|
-
|
7012
|
-
|
7473
|
+
// self-attention
|
7474
|
+
{
|
7475
|
+
struct ggml_tensor * Qcur = nullptr;
|
7476
|
+
struct ggml_tensor * Kcur = nullptr;
|
7477
|
+
struct ggml_tensor * Vcur = nullptr;
|
7013
7478
|
|
7014
|
-
|
7015
|
-
|
7016
|
-
cb(cur_expert, "ffn_moe_weighted", il);
|
7479
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
7480
|
+
cb(cur, "wqkv", il);
|
7017
7481
|
|
7018
|
-
|
7019
|
-
|
7020
|
-
} else {
|
7021
|
-
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
7022
|
-
cb(moe_out, "ffn_moe_out", il);
|
7023
|
-
}
|
7024
|
-
}
|
7482
|
+
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
7483
|
+
cb(cur, "wqkv_clamped", il);
|
7025
7484
|
|
7026
|
-
|
7485
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
7486
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
7487
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
7027
7488
|
|
7028
|
-
|
7029
|
-
|
7030
|
-
|
7031
|
-
|
7032
|
-
|
7033
|
-
|
7034
|
-
|
7035
|
-
|
7489
|
+
cb(Qcur, "Qcur", il);
|
7490
|
+
cb(Kcur, "Kcur", il);
|
7491
|
+
cb(Vcur, "Vcur", il);
|
7492
|
+
|
7493
|
+
Qcur = ggml_rope_custom(
|
7494
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7495
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7496
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7497
|
+
);
|
7498
|
+
cb(Qcur, "Qcur", il);
|
7499
|
+
|
7500
|
+
Kcur = ggml_rope_custom(
|
7501
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7502
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7503
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7504
|
+
);
|
7505
|
+
cb(Kcur, "Kcur", il);
|
7506
|
+
|
7507
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7508
|
+
model.layers[il].wo, NULL,
|
7509
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7036
7510
|
}
|
7037
7511
|
|
7512
|
+
if (il == n_layer - 1) {
|
7513
|
+
// skip computing output for unused tokens
|
7514
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7515
|
+
n_tokens = n_outputs;
|
7516
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7517
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7518
|
+
}
|
7519
|
+
|
7520
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7521
|
+
cb(ffn_inp, "ffn_inp", il);
|
7522
|
+
|
7523
|
+
// feed-forward network
|
7524
|
+
// MoE branch
|
7525
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
7526
|
+
model.layers[il].attn_out_norm, NULL,
|
7527
|
+
LLM_NORM, cb, il);
|
7528
|
+
cb(cur, "attn_out_norm", il);
|
7529
|
+
|
7530
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
7531
|
+
model.layers[il].ffn_gate_inp,
|
7532
|
+
model.layers[il].ffn_up_exps,
|
7533
|
+
model.layers[il].ffn_gate_exps,
|
7534
|
+
model.layers[il].ffn_down_exps,
|
7535
|
+
n_expert, n_expert_used,
|
7536
|
+
LLM_FFN_SILU, true,
|
7537
|
+
cb, il);
|
7538
|
+
cb(cur, "ffn_moe_out", il);
|
7038
7539
|
|
7039
7540
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
7040
7541
|
cb(cur, "ffn_out", il);
|
@@ -7052,18 +7553,13 @@ struct llm_build_context {
|
|
7052
7553
|
cur = inpL;
|
7053
7554
|
|
7054
7555
|
cur = llm_build_norm(ctx0, cur, hparams,
|
7055
|
-
|
7056
|
-
|
7556
|
+
model.output_norm, NULL,
|
7557
|
+
LLM_NORM, cb, -1);
|
7057
7558
|
cb(cur, "result_norm", -1);
|
7058
7559
|
|
7059
7560
|
// lm_head
|
7060
7561
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7061
7562
|
|
7062
|
-
// Grok
|
7063
|
-
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7064
|
-
|
7065
|
-
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7066
|
-
|
7067
7563
|
cb(cur, "result_output", -1);
|
7068
7564
|
|
7069
7565
|
ggml_build_forward_expand(gf, cur);
|
@@ -7923,7 +8419,7 @@ struct llm_build_context {
|
|
7923
8419
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7924
8420
|
|
7925
8421
|
for (int il = 0; il < n_layer; ++il) {
|
7926
|
-
|
8422
|
+
|
7927
8423
|
|
7928
8424
|
// norm
|
7929
8425
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
@@ -7932,6 +8428,8 @@ struct llm_build_context {
|
|
7932
8428
|
LLM_NORM, cb, il);
|
7933
8429
|
cb(cur, "attn_norm", il);
|
7934
8430
|
|
8431
|
+
struct ggml_tensor * inpSA = cur;
|
8432
|
+
|
7935
8433
|
// self-attention
|
7936
8434
|
{
|
7937
8435
|
// compute Q and K and RoPE them
|
@@ -7956,15 +8454,36 @@ struct llm_build_context {
|
|
7956
8454
|
cb(Vcur, "Vcur", il);
|
7957
8455
|
}
|
7958
8456
|
|
8457
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8458
|
+
cb(Qcur, "Qcur", il);
|
8459
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8460
|
+
cb(Kcur, "Kcur", il);
|
8461
|
+
|
8462
|
+
if (model.layers[il].attn_q_norm) {
|
8463
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
8464
|
+
model.layers[il].attn_q_norm,
|
8465
|
+
NULL,
|
8466
|
+
LLM_NORM, cb, il);
|
8467
|
+
cb(Qcur, "Qcur", il);
|
8468
|
+
}
|
8469
|
+
if (model.layers[il].attn_k_norm) {
|
8470
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
8471
|
+
model.layers[il].attn_k_norm,
|
8472
|
+
NULL,
|
8473
|
+
LLM_NORM, cb, il);
|
8474
|
+
cb(Kcur, "Kcur", il);
|
8475
|
+
}
|
8476
|
+
|
8477
|
+
|
7959
8478
|
Qcur = ggml_rope_custom(
|
7960
|
-
ctx0,
|
8479
|
+
ctx0, Qcur, inp_pos,
|
7961
8480
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7962
8481
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7963
8482
|
);
|
7964
8483
|
cb(Qcur, "Qcur", il);
|
7965
8484
|
|
7966
8485
|
Kcur = ggml_rope_custom(
|
7967
|
-
ctx0,
|
8486
|
+
ctx0, Kcur, inp_pos,
|
7968
8487
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7969
8488
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7970
8489
|
);
|
@@ -7979,20 +8498,25 @@ struct llm_build_context {
|
|
7979
8498
|
// skip computing output for unused tokens
|
7980
8499
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7981
8500
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8501
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7982
8502
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7983
8503
|
}
|
7984
8504
|
|
7985
|
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur,
|
8505
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7986
8506
|
cb(ffn_inp, "ffn_inp", il);
|
7987
8507
|
|
7988
8508
|
// feed-forward network
|
7989
8509
|
{
|
7990
|
-
|
7991
|
-
|
7992
|
-
|
7993
|
-
|
7994
|
-
|
7995
|
-
|
8510
|
+
if (model.layers[il].ffn_norm) {
|
8511
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8512
|
+
model.layers[il].ffn_norm,
|
8513
|
+
model.layers[il].ffn_norm_b,
|
8514
|
+
LLM_NORM, cb, il);
|
8515
|
+
cb(cur, "ffn_norm", il);
|
8516
|
+
} else {
|
8517
|
+
// parallel residual
|
8518
|
+
cur = inpSA;
|
8519
|
+
}
|
7996
8520
|
cur = llm_build_ffn(ctx0, cur,
|
7997
8521
|
model.layers[il].ffn_up, NULL,
|
7998
8522
|
model.layers[il].ffn_gate, NULL,
|
@@ -8182,12 +8706,6 @@ struct llm_build_context {
|
|
8182
8706
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8183
8707
|
cb(Vcur, "Vcur", il);
|
8184
8708
|
|
8185
|
-
// these nodes are added to the graph together so that they are not reordered
|
8186
|
-
// by doing so, the number of splits in the graph is reduced
|
8187
|
-
ggml_build_forward_expand(gf, Qcur);
|
8188
|
-
ggml_build_forward_expand(gf, Kcur);
|
8189
|
-
ggml_build_forward_expand(gf, Vcur);
|
8190
|
-
|
8191
8709
|
Qcur = ggml_rope_custom(
|
8192
8710
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8193
8711
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
@@ -8245,25 +8763,288 @@ struct llm_build_context {
|
|
8245
8763
|
LLM_NORM_RMS, cb, -1);
|
8246
8764
|
cb(cur, "result_norm", -1);
|
8247
8765
|
|
8248
|
-
// lm_head
|
8766
|
+
// lm_head
|
8767
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8768
|
+
cb(cur, "result_output", -1);
|
8769
|
+
|
8770
|
+
ggml_build_forward_expand(gf, cur);
|
8771
|
+
|
8772
|
+
return gf;
|
8773
|
+
}
|
8774
|
+
|
8775
|
+
struct ggml_cgraph * build_qwen2moe() {
|
8776
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8777
|
+
|
8778
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
8779
|
+
int32_t n_tokens = this->n_tokens;
|
8780
|
+
|
8781
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
8782
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
8783
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
8784
|
+
|
8785
|
+
struct ggml_tensor * cur;
|
8786
|
+
struct ggml_tensor * inpL;
|
8787
|
+
|
8788
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
8789
|
+
|
8790
|
+
// inp_pos - contains the positions
|
8791
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
8792
|
+
|
8793
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8794
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8795
|
+
|
8796
|
+
for (int il = 0; il < n_layer; ++il) {
|
8797
|
+
struct ggml_tensor * inpSA = inpL;
|
8798
|
+
|
8799
|
+
// norm
|
8800
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
8801
|
+
model.layers[il].attn_norm, NULL,
|
8802
|
+
LLM_NORM_RMS, cb, il);
|
8803
|
+
cb(cur, "attn_norm", il);
|
8804
|
+
|
8805
|
+
// self_attention
|
8806
|
+
{
|
8807
|
+
// compute Q and K and RoPE them
|
8808
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
8809
|
+
cb(Qcur, "Qcur", il);
|
8810
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
8811
|
+
cb(Qcur, "Qcur", il);
|
8812
|
+
|
8813
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
8814
|
+
cb(Kcur, "Kcur", il);
|
8815
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
8816
|
+
cb(Kcur, "Kcur", il);
|
8817
|
+
|
8818
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
8819
|
+
cb(Vcur, "Vcur", il);
|
8820
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8821
|
+
cb(Vcur, "Vcur", il);
|
8822
|
+
|
8823
|
+
Qcur = ggml_rope_custom(
|
8824
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8825
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8826
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8827
|
+
);
|
8828
|
+
cb(Qcur, "Qcur", il);
|
8829
|
+
|
8830
|
+
Kcur = ggml_rope_custom(
|
8831
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8832
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8833
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8834
|
+
);
|
8835
|
+
cb(Kcur, "Kcur", il);
|
8836
|
+
|
8837
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8838
|
+
model.layers[il].wo, model.layers[il].bo,
|
8839
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8840
|
+
}
|
8841
|
+
|
8842
|
+
if (il == n_layer - 1) {
|
8843
|
+
// skip computing output for unused tokens
|
8844
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8845
|
+
n_tokens = n_outputs;
|
8846
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8847
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8848
|
+
}
|
8849
|
+
|
8850
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
8851
|
+
cb(ffn_inp, "ffn_inp", il);
|
8852
|
+
|
8853
|
+
// MoE branch
|
8854
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8855
|
+
model.layers[il].ffn_norm, NULL,
|
8856
|
+
LLM_NORM_RMS, cb, il);
|
8857
|
+
cb(cur, "ffn_norm", il);
|
8858
|
+
|
8859
|
+
ggml_tensor * moe_out =
|
8860
|
+
llm_build_moe_ffn(ctx0, cur,
|
8861
|
+
model.layers[il].ffn_gate_inp,
|
8862
|
+
model.layers[il].ffn_up_exps,
|
8863
|
+
model.layers[il].ffn_gate_exps,
|
8864
|
+
model.layers[il].ffn_down_exps,
|
8865
|
+
n_expert, n_expert_used,
|
8866
|
+
LLM_FFN_SILU, false,
|
8867
|
+
cb, il);
|
8868
|
+
cb(cur, "ffn_moe_out", il);
|
8869
|
+
|
8870
|
+
// FFN shared expert
|
8871
|
+
{
|
8872
|
+
ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
|
8873
|
+
cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
|
8874
|
+
|
8875
|
+
// sigmoid
|
8876
|
+
ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
|
8877
|
+
cb(cur_gate, "ffn_shexp_gate", il);
|
8878
|
+
|
8879
|
+
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
|
8880
|
+
model.layers[il].ffn_up_shexp, NULL,
|
8881
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
8882
|
+
model.layers[il].ffn_down_shexp, NULL,
|
8883
|
+
NULL,
|
8884
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
8885
|
+
cb(cur_ffn, "ffn_shexp", il);
|
8886
|
+
|
8887
|
+
ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
|
8888
|
+
cb(ffn_shexp_out, "ffn_shexp_out", il);
|
8889
|
+
|
8890
|
+
moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
|
8891
|
+
cb(moe_out, "ffn_out", il);
|
8892
|
+
|
8893
|
+
cur = moe_out;
|
8894
|
+
}
|
8895
|
+
|
8896
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
8897
|
+
cb(cur, "l_out", il);
|
8898
|
+
|
8899
|
+
// input for next layer
|
8900
|
+
inpL = cur;
|
8901
|
+
}
|
8902
|
+
|
8903
|
+
cur = inpL;
|
8904
|
+
|
8905
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
8906
|
+
model.output_norm, NULL,
|
8907
|
+
LLM_NORM_RMS, cb, -1);
|
8908
|
+
cb(cur, "result_norm", -1);
|
8909
|
+
|
8910
|
+
// lm_head
|
8911
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8912
|
+
cb(cur, "result_output", -1);
|
8913
|
+
|
8914
|
+
ggml_build_forward_expand(gf, cur);
|
8915
|
+
|
8916
|
+
return gf;
|
8917
|
+
}
|
8918
|
+
|
8919
|
+
struct ggml_cgraph * build_phi2() {
|
8920
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8921
|
+
|
8922
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
8923
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
8924
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
8925
|
+
|
8926
|
+
struct ggml_tensor * cur;
|
8927
|
+
struct ggml_tensor * attn_norm_output;
|
8928
|
+
struct ggml_tensor * ffn_output;
|
8929
|
+
struct ggml_tensor * inpL;
|
8930
|
+
|
8931
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
8932
|
+
|
8933
|
+
// inp_pos - contains the positions
|
8934
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
8935
|
+
|
8936
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8937
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8938
|
+
|
8939
|
+
for (int il = 0; il < n_layer; ++il) {
|
8940
|
+
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
8941
|
+
model.layers[il].attn_norm,
|
8942
|
+
model.layers[il].attn_norm_b,
|
8943
|
+
LLM_NORM, cb, il);
|
8944
|
+
cb(attn_norm_output, "attn_norm", il);
|
8945
|
+
|
8946
|
+
// self-attention
|
8947
|
+
{
|
8948
|
+
struct ggml_tensor * Qcur = nullptr;
|
8949
|
+
struct ggml_tensor * Kcur = nullptr;
|
8950
|
+
struct ggml_tensor * Vcur = nullptr;
|
8951
|
+
|
8952
|
+
if (model.layers[il].wqkv) {
|
8953
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
8954
|
+
cb(cur, "wqkv", il);
|
8955
|
+
|
8956
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
8957
|
+
cb(cur, "bqkv", il);
|
8958
|
+
|
8959
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
8960
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
8961
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
8962
|
+
} else {
|
8963
|
+
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
8964
|
+
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
8965
|
+
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
8966
|
+
}
|
8967
|
+
|
8968
|
+
cb(Qcur, "Qcur", il);
|
8969
|
+
cb(Kcur, "Kcur", il);
|
8970
|
+
cb(Vcur, "Vcur", il);
|
8971
|
+
|
8972
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8973
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8974
|
+
|
8975
|
+
Qcur = ggml_rope_custom(
|
8976
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8977
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8978
|
+
);
|
8979
|
+
cb(Qcur, "Qcur", il);
|
8980
|
+
|
8981
|
+
// with phi2, we scale the Q to avoid precision issues
|
8982
|
+
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
8983
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
8984
|
+
cb(Qcur, "Qcur", il);
|
8985
|
+
|
8986
|
+
Kcur = ggml_rope_custom(
|
8987
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8988
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8989
|
+
);
|
8990
|
+
cb(Kcur, "Kcur", il);
|
8991
|
+
|
8992
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8993
|
+
model.layers[il].wo, model.layers[il].bo,
|
8994
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8995
|
+
}
|
8996
|
+
|
8997
|
+
if (il == n_layer - 1) {
|
8998
|
+
// skip computing output for unused tokens
|
8999
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9000
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9001
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9002
|
+
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
9003
|
+
}
|
9004
|
+
|
9005
|
+
// FF
|
9006
|
+
{
|
9007
|
+
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
9008
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
9009
|
+
NULL, NULL,
|
9010
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
9011
|
+
NULL,
|
9012
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
9013
|
+
cb(ffn_output, "ffn_out", il);
|
9014
|
+
}
|
9015
|
+
|
9016
|
+
cur = ggml_add(ctx0, cur, ffn_output);
|
9017
|
+
cb(cur, "l_out", il);
|
9018
|
+
|
9019
|
+
cur = ggml_add(ctx0, cur, inpL);
|
9020
|
+
cb(cur, "l_out", il);
|
9021
|
+
|
9022
|
+
inpL = cur;
|
9023
|
+
}
|
9024
|
+
|
9025
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
9026
|
+
model.output_norm,
|
9027
|
+
model.output_norm_b,
|
9028
|
+
LLM_NORM, cb, -1);
|
9029
|
+
cb(cur, "result_norm", -1);
|
9030
|
+
|
8249
9031
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8250
|
-
cb(cur, "
|
9032
|
+
cb(cur, "result_output_no_bias", -1);
|
8251
9033
|
|
9034
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
9035
|
+
cb(cur, "result_output", -1);
|
8252
9036
|
ggml_build_forward_expand(gf, cur);
|
8253
|
-
|
8254
9037
|
return gf;
|
8255
9038
|
}
|
8256
9039
|
|
8257
|
-
struct ggml_cgraph *
|
9040
|
+
struct ggml_cgraph * build_phi3() {
|
8258
9041
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8259
9042
|
|
8260
9043
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
8261
|
-
const int64_t n_embd_gqa
|
9044
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
8262
9045
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
8263
9046
|
|
8264
9047
|
struct ggml_tensor * cur;
|
8265
|
-
struct ggml_tensor * attn_norm_output;
|
8266
|
-
struct ggml_tensor * ffn_output;
|
8267
9048
|
struct ggml_tensor * inpL;
|
8268
9049
|
|
8269
9050
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
@@ -8275,14 +9056,16 @@ struct llm_build_context {
|
|
8275
9056
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8276
9057
|
|
8277
9058
|
for (int il = 0; il < n_layer; ++il) {
|
8278
|
-
|
8279
|
-
model.layers[il].attn_norm,
|
8280
|
-
model.layers[il].attn_norm_b,
|
8281
|
-
LLM_NORM, cb, il);
|
8282
|
-
cb(attn_norm_output, "attn_norm", il);
|
9059
|
+
auto residual = inpL;
|
8283
9060
|
|
8284
9061
|
// self-attention
|
8285
9062
|
{
|
9063
|
+
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
9064
|
+
model.layers[il].attn_norm,
|
9065
|
+
NULL,
|
9066
|
+
LLM_NORM_RMS, cb, il);
|
9067
|
+
cb(attn_norm_output, "attn_norm", il);
|
9068
|
+
|
8286
9069
|
struct ggml_tensor * Qcur = nullptr;
|
8287
9070
|
struct ggml_tensor * Kcur = nullptr;
|
8288
9071
|
struct ggml_tensor * Vcur = nullptr;
|
@@ -8291,13 +9074,11 @@ struct llm_build_context {
|
|
8291
9074
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
8292
9075
|
cb(cur, "wqkv", il);
|
8293
9076
|
|
8294
|
-
|
8295
|
-
|
8296
|
-
|
8297
|
-
|
8298
|
-
|
8299
|
-
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
8300
|
-
} else {
|
9077
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
|
9078
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
|
9079
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
|
9080
|
+
}
|
9081
|
+
else {
|
8301
9082
|
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
8302
9083
|
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
8303
9084
|
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
@@ -8316,9 +9097,7 @@ struct llm_build_context {
|
|
8316
9097
|
);
|
8317
9098
|
cb(Qcur, "Qcur", il);
|
8318
9099
|
|
8319
|
-
|
8320
|
-
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
8321
|
-
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
9100
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
8322
9101
|
cb(Qcur, "Qcur", il);
|
8323
9102
|
|
8324
9103
|
Kcur = ggml_rope_custom(
|
@@ -8328,48 +9107,58 @@ struct llm_build_context {
|
|
8328
9107
|
cb(Kcur, "Kcur", il);
|
8329
9108
|
|
8330
9109
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8331
|
-
|
8332
|
-
|
9110
|
+
model.layers[il].wo, NULL,
|
9111
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8333
9112
|
}
|
8334
9113
|
|
8335
9114
|
if (il == n_layer - 1) {
|
8336
9115
|
// skip computing output for unused tokens
|
8337
|
-
struct ggml_tensor
|
8338
|
-
cur
|
8339
|
-
|
8340
|
-
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
9116
|
+
struct ggml_tensor* inp_out_ids = build_inp_out_ids();
|
9117
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9118
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
8341
9119
|
}
|
8342
9120
|
|
9121
|
+
cur = ggml_add(ctx0, cur, residual);
|
9122
|
+
residual = cur;
|
9123
|
+
|
9124
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
9125
|
+
model.layers[il].ffn_norm, NULL,
|
9126
|
+
LLM_NORM_RMS, cb, il);
|
9127
|
+
cb(cur, "ffn_norm", il);
|
9128
|
+
|
8343
9129
|
// FF
|
9130
|
+
// special-case: the up and gate tensors are merged into a single tensor
|
9131
|
+
// TODO: add support for merged up/gate tensors to llm_build_ffn
|
8344
9132
|
{
|
8345
|
-
|
8346
|
-
|
8347
|
-
NULL, NULL,
|
8348
|
-
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8349
|
-
NULL,
|
8350
|
-
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
8351
|
-
cb(ffn_output, "ffn_out", il);
|
8352
|
-
}
|
9133
|
+
struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
|
9134
|
+
cb(up, "ffn_up", il);
|
8353
9135
|
|
8354
|
-
|
8355
|
-
|
9136
|
+
auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
|
9137
|
+
auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
|
8356
9138
|
|
8357
|
-
|
9139
|
+
y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
|
9140
|
+
cb(y, "ffn_gate", il);
|
9141
|
+
|
9142
|
+
auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
|
9143
|
+
cb(down, "ffn_down", il);
|
9144
|
+
|
9145
|
+
cur = down;
|
9146
|
+
cb(cur, "ffn_out", il);
|
9147
|
+
}
|
9148
|
+
|
9149
|
+
cur = ggml_add(ctx0, residual, cur);
|
8358
9150
|
cb(cur, "l_out", il);
|
8359
9151
|
|
8360
9152
|
inpL = cur;
|
8361
9153
|
}
|
8362
9154
|
|
8363
9155
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
8364
|
-
|
8365
|
-
|
8366
|
-
|
9156
|
+
model.output_norm,
|
9157
|
+
NULL,
|
9158
|
+
LLM_NORM_RMS, cb, -1);
|
8367
9159
|
cb(cur, "result_norm", -1);
|
8368
9160
|
|
8369
9161
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8370
|
-
cb(cur, "result_output_no_bias", -1);
|
8371
|
-
|
8372
|
-
cur = ggml_add(ctx0, cur, model.output_b);
|
8373
9162
|
cb(cur, "result_output", -1);
|
8374
9163
|
|
8375
9164
|
ggml_build_forward_expand(gf, cur);
|
@@ -8377,6 +9166,7 @@ struct llm_build_context {
|
|
8377
9166
|
return gf;
|
8378
9167
|
}
|
8379
9168
|
|
9169
|
+
|
8380
9170
|
struct ggml_cgraph * build_plamo() {
|
8381
9171
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
8382
9172
|
|
@@ -9588,6 +10378,139 @@ struct llm_build_context {
|
|
9588
10378
|
return gf;
|
9589
10379
|
|
9590
10380
|
}
|
10381
|
+
|
10382
|
+
// ref: https://allenai.org/olmo
|
10383
|
+
// based on the original build_llama() function, changes:
|
10384
|
+
// * non-parametric layer norm
|
10385
|
+
// * clamp qkv
|
10386
|
+
// * removed bias
|
10387
|
+
// * removed MoE
|
10388
|
+
struct ggml_cgraph * build_olmo() {
|
10389
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10390
|
+
|
10391
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
10392
|
+
int32_t n_tokens = this->n_tokens;
|
10393
|
+
|
10394
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10395
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10396
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
10397
|
+
|
10398
|
+
struct ggml_tensor * cur;
|
10399
|
+
struct ggml_tensor * inpL;
|
10400
|
+
|
10401
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10402
|
+
|
10403
|
+
// inp_pos - contains the positions
|
10404
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10405
|
+
|
10406
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10407
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10408
|
+
|
10409
|
+
for (int il = 0; il < n_layer; ++il) {
|
10410
|
+
struct ggml_tensor * inpSA = inpL;
|
10411
|
+
|
10412
|
+
// norm
|
10413
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10414
|
+
NULL, NULL,
|
10415
|
+
LLM_NORM, cb, il);
|
10416
|
+
cb(cur, "attn_norm", il);
|
10417
|
+
|
10418
|
+
// self-attention
|
10419
|
+
{
|
10420
|
+
// compute Q and K and RoPE them
|
10421
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10422
|
+
cb(Qcur, "Qcur", il);
|
10423
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10424
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10425
|
+
cb(Qcur, "Qcur", il);
|
10426
|
+
}
|
10427
|
+
|
10428
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10429
|
+
cb(Kcur, "Kcur", il);
|
10430
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10431
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10432
|
+
cb(Kcur, "Kcur", il);
|
10433
|
+
}
|
10434
|
+
|
10435
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10436
|
+
cb(Vcur, "Vcur", il);
|
10437
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10438
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10439
|
+
cb(Vcur, "Vcur", il);
|
10440
|
+
}
|
10441
|
+
|
10442
|
+
Qcur = ggml_rope_custom(
|
10443
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10444
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10445
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10446
|
+
);
|
10447
|
+
cb(Qcur, "Qcur", il);
|
10448
|
+
|
10449
|
+
Kcur = ggml_rope_custom(
|
10450
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10451
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10452
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10453
|
+
);
|
10454
|
+
cb(Kcur, "Kcur", il);
|
10455
|
+
|
10456
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10457
|
+
model.layers[il].wo, nullptr,
|
10458
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10459
|
+
}
|
10460
|
+
|
10461
|
+
if (il == n_layer - 1) {
|
10462
|
+
// skip computing output for unused tokens
|
10463
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10464
|
+
n_tokens = n_outputs;
|
10465
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10466
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
10467
|
+
}
|
10468
|
+
|
10469
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
10470
|
+
cb(ffn_inp, "ffn_inp", il);
|
10471
|
+
|
10472
|
+
// feed-forward network
|
10473
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10474
|
+
NULL, NULL,
|
10475
|
+
LLM_NORM, cb, il);
|
10476
|
+
cb(cur, "ffn_norm", il);
|
10477
|
+
|
10478
|
+
cur = llm_build_ffn(ctx0, cur,
|
10479
|
+
model.layers[il].ffn_up, NULL,
|
10480
|
+
model.layers[il].ffn_gate, NULL,
|
10481
|
+
model.layers[il].ffn_down, NULL,
|
10482
|
+
NULL,
|
10483
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10484
|
+
cb(cur, "ffn_out", il);
|
10485
|
+
|
10486
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
10487
|
+
cb(cur, "ffn_out", il);
|
10488
|
+
|
10489
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
10490
|
+
if (layer_dir != nullptr) {
|
10491
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
10492
|
+
}
|
10493
|
+
cb(cur, "l_out", il);
|
10494
|
+
|
10495
|
+
// input for next layer
|
10496
|
+
inpL = cur;
|
10497
|
+
}
|
10498
|
+
|
10499
|
+
cur = inpL;
|
10500
|
+
|
10501
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10502
|
+
NULL, NULL,
|
10503
|
+
LLM_NORM, cb, -1);
|
10504
|
+
cb(cur, "result_norm", -1);
|
10505
|
+
|
10506
|
+
// lm_head
|
10507
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10508
|
+
cb(cur, "result_output", -1);
|
10509
|
+
|
10510
|
+
ggml_build_forward_expand(gf, cur);
|
10511
|
+
|
10512
|
+
return gf;
|
10513
|
+
}
|
9591
10514
|
};
|
9592
10515
|
|
9593
10516
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -9737,10 +10660,18 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9737
10660
|
{
|
9738
10661
|
result = llm.build_qwen2();
|
9739
10662
|
} break;
|
10663
|
+
case LLM_ARCH_QWEN2MOE:
|
10664
|
+
{
|
10665
|
+
result = llm.build_qwen2moe();
|
10666
|
+
} break;
|
9740
10667
|
case LLM_ARCH_PHI2:
|
9741
10668
|
{
|
9742
10669
|
result = llm.build_phi2();
|
9743
10670
|
} break;
|
10671
|
+
case LLM_ARCH_PHI3:
|
10672
|
+
{
|
10673
|
+
result = llm.build_phi3();
|
10674
|
+
} break;
|
9744
10675
|
case LLM_ARCH_PLAMO:
|
9745
10676
|
{
|
9746
10677
|
result = llm.build_plamo();
|
@@ -9785,6 +10716,14 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9785
10716
|
{
|
9786
10717
|
result = llm.build_command_r();
|
9787
10718
|
} break;
|
10719
|
+
case LLM_ARCH_DBRX:
|
10720
|
+
{
|
10721
|
+
result = llm.build_dbrx();
|
10722
|
+
} break;
|
10723
|
+
case LLM_ARCH_OLMO:
|
10724
|
+
{
|
10725
|
+
result = llm.build_olmo();
|
10726
|
+
} break;
|
9788
10727
|
default:
|
9789
10728
|
GGML_ASSERT(false);
|
9790
10729
|
}
|
@@ -12556,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
12556
13495
|
GGML_ASSERT(ctx);
|
12557
13496
|
const int64_t t_start_sample_us = ggml_time_us();
|
12558
13497
|
|
12559
|
-
bool
|
13498
|
+
bool allow_eog = false;
|
12560
13499
|
for (const auto & stack : grammar->stacks) {
|
12561
13500
|
if (stack.empty()) {
|
12562
|
-
|
13501
|
+
allow_eog = true;
|
12563
13502
|
break;
|
12564
13503
|
}
|
12565
13504
|
}
|
12566
13505
|
|
12567
|
-
const llama_token eos = llama_token_eos(&ctx->model);
|
12568
|
-
|
12569
13506
|
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
12570
13507
|
candidates_decoded.reserve(candidates->size);
|
12571
13508
|
std::vector<llama_grammar_candidate> candidates_grammar;
|
@@ -12573,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
12573
13510
|
|
12574
13511
|
for (size_t i = 0; i < candidates->size; ++i) {
|
12575
13512
|
const llama_token id = candidates->data[i].id;
|
12576
|
-
const std::string piece = llama_token_to_piece(ctx, id);
|
12577
|
-
|
12578
|
-
|
13513
|
+
const std::string piece = llama_token_to_piece(ctx, id, false);
|
13514
|
+
|
13515
|
+
if (llama_token_is_eog(&ctx->model, id)) {
|
13516
|
+
if (!allow_eog) {
|
12579
13517
|
candidates->data[i].logit = -INFINITY;
|
12580
13518
|
}
|
12581
13519
|
} else if (piece.empty() || piece[0] == 0) {
|
@@ -12738,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
|
|
12738
13676
|
return result;
|
12739
13677
|
}
|
12740
13678
|
|
12741
|
-
llama_token
|
13679
|
+
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
|
12742
13680
|
GGML_ASSERT(ctx);
|
12743
13681
|
|
12744
13682
|
const int64_t t_start_sample_us = ggml_time_us();
|
@@ -12751,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
12751
13689
|
}
|
12752
13690
|
|
12753
13691
|
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
12754
|
-
auto & rng = ctx->rng;
|
12755
13692
|
int idx = dist(rng);
|
12756
13693
|
|
12757
13694
|
llama_token result = candidates->data[idx].id;
|
@@ -12761,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
12761
13698
|
return result;
|
12762
13699
|
}
|
12763
13700
|
|
13701
|
+
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
|
13702
|
+
return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
|
13703
|
+
}
|
13704
|
+
|
12764
13705
|
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
12765
13706
|
const int64_t t_start_sample_us = ggml_time_us();
|
12766
13707
|
|
12767
|
-
if (
|
13708
|
+
if (llama_token_is_eog(&ctx->model, token)) {
|
12768
13709
|
for (const auto & stack : grammar->stacks) {
|
12769
13710
|
if (stack.empty()) {
|
12770
13711
|
return;
|
@@ -12773,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
12773
13714
|
GGML_ASSERT(false);
|
12774
13715
|
}
|
12775
13716
|
|
12776
|
-
const std::string piece = llama_token_to_piece(ctx, token);
|
13717
|
+
const std::string piece = llama_token_to_piece(ctx, token, false);
|
12777
13718
|
|
12778
13719
|
// Note terminating 0 in decoded string
|
12779
13720
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
@@ -12915,6 +13856,11 @@ struct llama_beam_search_data {
|
|
12915
13856
|
}
|
12916
13857
|
llama_logit_info logit_info(ctx);
|
12917
13858
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
13859
|
+
|
13860
|
+
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
13861
|
+
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
13862
|
+
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
13863
|
+
|
12918
13864
|
size_t i=0;
|
12919
13865
|
if (next_beams.size() < n_beams) {
|
12920
13866
|
for (; next_beams.size() < n_beams ; ++i) {
|
@@ -13535,6 +14481,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13535
14481
|
gguf_set_kv (ctx_out, ml.meta);
|
13536
14482
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
13537
14483
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
14484
|
+
// Remove split metadata
|
14485
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
|
14486
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
|
14487
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
13538
14488
|
|
13539
14489
|
if (params->kv_overrides) {
|
13540
14490
|
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
@@ -13587,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13587
14537
|
std::vector<no_init<uint8_t>> work;
|
13588
14538
|
std::vector<no_init<float>> f32_conv_buf;
|
13589
14539
|
|
14540
|
+
uint16_t n_split = 1;
|
14541
|
+
// Assume split indices are contiguous, so the split count is the largest index + 1
|
14542
|
+
if (params->keep_split) {
|
14543
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
14544
|
+
n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
|
14545
|
+
}
|
14546
|
+
}
|
14547
|
+
std::vector<gguf_context*> ctx_outs(n_split, NULL);
|
14548
|
+
ctx_outs[0] = ctx_out;
|
14549
|
+
|
13590
14550
|
// populate the original tensors so we get an initial meta data
|
13591
14551
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
13592
|
-
|
13593
|
-
|
14552
|
+
auto weight = ml.get_weight(i);
|
14553
|
+
uint16_t i_split = params->keep_split ? weight->idx : 0;
|
14554
|
+
struct ggml_tensor * tensor = weight->tensor;
|
14555
|
+
if (ctx_outs[i_split] == NULL) {
|
14556
|
+
ctx_outs[i_split] = gguf_init_empty();
|
14557
|
+
}
|
14558
|
+
gguf_add_tensor(ctx_outs[i_split], tensor);
|
13594
14559
|
}
|
13595
14560
|
|
13596
|
-
|
13597
|
-
|
13598
|
-
|
13599
|
-
|
14561
|
+
// Set split info if needed
|
14562
|
+
if (n_split > 1) {
|
14563
|
+
for (size_t i = 0; i < ctx_outs.size(); ++i) {
|
14564
|
+
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
|
14565
|
+
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
|
14566
|
+
gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
|
14567
|
+
}
|
14568
|
+
}
|
13600
14569
|
|
13601
|
-
|
14570
|
+
int cur_split = -1;
|
14571
|
+
std::ofstream fout;
|
14572
|
+
auto close_ofstream = [&]() {
|
14573
|
+
// Write metadata and close file handler
|
14574
|
+
if (fout.is_open()) {
|
14575
|
+
fout.seekp(0);
|
14576
|
+
std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
|
14577
|
+
gguf_get_meta_data(ctx_outs[cur_split], data.data());
|
14578
|
+
fout.write((const char *) data.data(), data.size());
|
14579
|
+
fout.close();
|
14580
|
+
}
|
14581
|
+
};
|
14582
|
+
auto new_ofstream = [&](int index) {
|
14583
|
+
cur_split = index;
|
14584
|
+
GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
|
14585
|
+
std::string fname = fname_out;
|
14586
|
+
if (params->keep_split) {
|
14587
|
+
char split_path[PATH_MAX] = {0};
|
14588
|
+
llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
|
14589
|
+
fname = std::string(split_path);
|
14590
|
+
}
|
13602
14591
|
|
13603
|
-
|
13604
|
-
|
14592
|
+
fout = std::ofstream(fname, std::ios::binary);
|
14593
|
+
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
14594
|
+
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
|
14595
|
+
// placeholder for the meta data
|
14596
|
+
::zeros(fout, meta_size);
|
14597
|
+
};
|
13605
14598
|
|
13606
14599
|
const auto tn = LLM_TN(model.arch);
|
13607
|
-
|
14600
|
+
new_ofstream(0);
|
13608
14601
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
13609
|
-
|
14602
|
+
auto weight = ml.get_weight(i);
|
14603
|
+
struct ggml_tensor * tensor = weight->tensor;
|
14604
|
+
if (weight->idx != cur_split && params->keep_split) {
|
14605
|
+
close_ofstream();
|
14606
|
+
new_ofstream(weight->idx);
|
14607
|
+
}
|
13610
14608
|
|
13611
14609
|
const std::string name = ggml_get_name(tensor);
|
13612
14610
|
|
@@ -13761,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13761
14759
|
total_size_new += new_size;
|
13762
14760
|
|
13763
14761
|
// update the gguf meta data as we go
|
13764
|
-
gguf_set_tensor_type(
|
13765
|
-
gguf_set_tensor_data(
|
14762
|
+
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
|
14763
|
+
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
|
13766
14764
|
|
13767
14765
|
// write tensor data + padding
|
13768
14766
|
fout.write((const char *) new_data, new_size);
|
13769
14767
|
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
13770
14768
|
}
|
13771
|
-
|
13772
|
-
|
13773
|
-
|
13774
|
-
fout.seekp(0);
|
13775
|
-
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
13776
|
-
gguf_get_meta_data(ctx_out, data.data());
|
13777
|
-
fout.write((const char *) data.data(), data.size());
|
14769
|
+
close_ofstream();
|
14770
|
+
for (auto & c:ctx_outs) {
|
14771
|
+
gguf_free(c);
|
13778
14772
|
}
|
13779
14773
|
|
13780
|
-
fout.close();
|
13781
|
-
|
13782
|
-
gguf_free(ctx_out);
|
13783
|
-
|
13784
14774
|
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
13785
14775
|
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
13786
14776
|
|
@@ -14136,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
14136
15126
|
/*.quantize_output_tensor =*/ true,
|
14137
15127
|
/*.only_copy =*/ false,
|
14138
15128
|
/*.pure =*/ false,
|
15129
|
+
/*.keep_split =*/ false,
|
14139
15130
|
/*.imatrix =*/ nullptr,
|
14140
15131
|
/*.kv_overrides =*/ nullptr,
|
14141
15132
|
};
|
@@ -14629,18 +15620,22 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
14629
15620
|
case LLM_ARCH_MINICPM:
|
14630
15621
|
case LLM_ARCH_XVERSE:
|
14631
15622
|
case LLM_ARCH_COMMAND_R:
|
15623
|
+
case LLM_ARCH_OLMO:
|
14632
15624
|
return LLAMA_ROPE_TYPE_NORM;
|
14633
15625
|
|
14634
15626
|
// the pairs of head values are offset by n_rot/2
|
14635
15627
|
case LLM_ARCH_FALCON:
|
14636
15628
|
case LLM_ARCH_GROK:
|
15629
|
+
case LLM_ARCH_DBRX:
|
14637
15630
|
case LLM_ARCH_PERSIMMON:
|
14638
15631
|
case LLM_ARCH_BERT:
|
14639
15632
|
case LLM_ARCH_NOMIC_BERT:
|
14640
15633
|
case LLM_ARCH_STABLELM:
|
14641
15634
|
case LLM_ARCH_QWEN:
|
14642
15635
|
case LLM_ARCH_QWEN2:
|
15636
|
+
case LLM_ARCH_QWEN2MOE:
|
14643
15637
|
case LLM_ARCH_PHI2:
|
15638
|
+
case LLM_ARCH_PHI3:
|
14644
15639
|
case LLM_ARCH_GEMMA:
|
14645
15640
|
case LLM_ARCH_STARCODER2:
|
14646
15641
|
return LLAMA_ROPE_TYPE_NEOX;
|
@@ -14654,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
14654
15649
|
return LLAMA_ROPE_TYPE_NONE;
|
14655
15650
|
}
|
14656
15651
|
|
15652
|
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
|
15653
|
+
return ctx->cparams.pooling_type;
|
15654
|
+
}
|
15655
|
+
|
14657
15656
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
14658
15657
|
return model->hparams.n_vocab;
|
14659
15658
|
}
|
@@ -15132,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
|
|
15132
16131
|
*
|
15133
16132
|
*/
|
15134
16133
|
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
16134
|
+
llama_synchronize(ctx);
|
16135
|
+
|
15135
16136
|
// copy rng
|
15136
16137
|
{
|
15137
16138
|
std::ostringstream rng_ss;
|
@@ -15284,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
|
|
15284
16285
|
|
15285
16286
|
// Sets the state reading from the specified source address
|
15286
16287
|
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
16288
|
+
llama_synchronize(ctx);
|
16289
|
+
|
15287
16290
|
const uint8_t * inp = src;
|
15288
16291
|
|
15289
16292
|
// set rng
|
@@ -15320,6 +16323,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
15320
16323
|
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15321
16324
|
ctx->output_ids[id] = i;
|
15322
16325
|
}
|
16326
|
+
|
16327
|
+
ctx->n_outputs = n_outputs;
|
15323
16328
|
}
|
15324
16329
|
}
|
15325
16330
|
|
@@ -15586,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
|
|
15586
16591
|
}
|
15587
16592
|
|
15588
16593
|
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
16594
|
+
llama_synchronize(ctx);
|
16595
|
+
|
15589
16596
|
const auto & kv_self = ctx->kv_self;
|
15590
16597
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
15591
16598
|
|
@@ -15703,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
|
|
15703
16710
|
}
|
15704
16711
|
|
15705
16712
|
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
16713
|
+
llama_synchronize(ctx);
|
16714
|
+
|
15706
16715
|
auto & kv_self = ctx->kv_self;
|
15707
16716
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
15708
16717
|
|
@@ -16154,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
|
|
16154
17163
|
return model->vocab.id_to_token[token].type;
|
16155
17164
|
}
|
16156
17165
|
|
17166
|
+
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
17167
|
+
return token != -1 && (
|
17168
|
+
token == llama_token_eos(model) ||
|
17169
|
+
token == llama_token_eot(model)
|
17170
|
+
);
|
17171
|
+
}
|
17172
|
+
|
16157
17173
|
llama_token llama_token_bos(const struct llama_model * model) {
|
16158
17174
|
return model->vocab.special_bos_id;
|
16159
17175
|
}
|
@@ -16231,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
|
|
16231
17247
|
}
|
16232
17248
|
|
16233
17249
|
// does not write null-terminator to buf
|
16234
|
-
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
|
17250
|
+
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
16235
17251
|
if (0 <= token && token < llama_n_vocab(model)) {
|
16236
17252
|
switch (llama_vocab_get_type(model->vocab)) {
|
16237
17253
|
case LLAMA_VOCAB_TYPE_WPM:
|
@@ -16246,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
16246
17262
|
}
|
16247
17263
|
memcpy(buf, result.c_str(), result.length());
|
16248
17264
|
return result.length();
|
16249
|
-
} else if (
|
17265
|
+
} else if (
|
17266
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
17267
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
16250
17268
|
std::string result = model->vocab.id_to_token[token].text;
|
16251
17269
|
if (length < (int) result.length()) {
|
16252
17270
|
return -(int) result.length();
|
@@ -16259,8 +17277,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
16259
17277
|
}
|
16260
17278
|
memcpy(buf, "\xe2\x96\x85", 3);
|
16261
17279
|
return 3;
|
16262
|
-
} else if (llama_is_control_token(model->vocab, token)) {
|
16263
|
-
;
|
16264
17280
|
} else if (llama_is_byte_token(model->vocab, token)) {
|
16265
17281
|
if (length < 1) {
|
16266
17282
|
return -1;
|
@@ -16281,15 +17297,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
16281
17297
|
}
|
16282
17298
|
memcpy(buf, result.c_str(), result.length());
|
16283
17299
|
return result.length();
|
16284
|
-
} else if (
|
17300
|
+
} else if (
|
17301
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
17302
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
16285
17303
|
std::string result = model->vocab.id_to_token[token].text;
|
16286
17304
|
if (length < (int) result.length()) {
|
16287
17305
|
return -(int) result.length();
|
16288
17306
|
}
|
16289
17307
|
memcpy(buf, result.c_str(), result.length());
|
16290
17308
|
return result.length();
|
16291
|
-
} else if (llama_is_control_token(model->vocab, token)) {
|
16292
|
-
;
|
16293
17309
|
}
|
16294
17310
|
break;
|
16295
17311
|
}
|
@@ -16472,6 +17488,39 @@ static int32_t llama_chat_apply_template_internal(
|
|
16472
17488
|
if (add_ass) {
|
16473
17489
|
ss << "### Response:\n";
|
16474
17490
|
}
|
17491
|
+
} else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
|
17492
|
+
// CohereForAI/c4ai-command-r-plus
|
17493
|
+
for (auto message : chat) {
|
17494
|
+
std::string role(message->role);
|
17495
|
+
if (role == "system") {
|
17496
|
+
ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17497
|
+
} else if (role == "user") {
|
17498
|
+
ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17499
|
+
} else if (role == "assistant") {
|
17500
|
+
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17501
|
+
}
|
17502
|
+
}
|
17503
|
+
if (add_ass) {
|
17504
|
+
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
|
17505
|
+
}
|
17506
|
+
} else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
|
17507
|
+
// Llama 3
|
17508
|
+
for (auto message : chat) {
|
17509
|
+
std::string role(message->role);
|
17510
|
+
ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
|
17511
|
+
}
|
17512
|
+
if (add_ass) {
|
17513
|
+
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
17514
|
+
}
|
17515
|
+
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
|
17516
|
+
// Phi 3
|
17517
|
+
for (auto message : chat) {
|
17518
|
+
std::string role(message->role);
|
17519
|
+
ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
|
17520
|
+
}
|
17521
|
+
if (add_ass) {
|
17522
|
+
ss << "<|assistant|>\n";
|
17523
|
+
}
|
16475
17524
|
} else {
|
16476
17525
|
// template not supported
|
16477
17526
|
return -1;
|
@@ -16604,6 +17653,11 @@ const char * llama_print_system_info(void) {
|
|
16604
17653
|
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
16605
17654
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
16606
17655
|
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
17656
|
+
#ifdef GGML_USE_LLAMAFILE
|
17657
|
+
s += "LAMMAFILE = 1 | ";
|
17658
|
+
#else
|
17659
|
+
s += "LAMMAFILE = 0 | ";
|
17660
|
+
#endif
|
16607
17661
|
|
16608
17662
|
return s.c_str();
|
16609
17663
|
}
|