llama_cpp 0.14.5 → 0.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +24 -7
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +263 -5
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -294
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +151 -99
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +1308 -254
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +999 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
@@ -105,7 +105,7 @@
 #endif

 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 60


 //

@@ -209,7 +209,9 @@ enum llm_arch {
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
+    LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,

@@ -220,6 +222,8 @@ enum llm_arch {
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
+    LLM_ARCH_DBRX,
+    LLM_ARCH_OLMO,
     LLM_ARCH_UNKNOWN,
 };

@@ -241,7 +245,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
     { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },

@@ -252,6 +258,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
+    { LLM_ARCH_DBRX, "dbrx" },
+    { LLM_ARCH_OLMO, "olmo" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -325,6 +333,10 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
+    LLM_KV_TOKENIZER_EOT_ID,
 };

 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

@@ -397,6 +409,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+    { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
 };

 struct LLM_KV {

@@ -427,6 +443,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,

@@ -438,6 +455,9 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
+    LLM_TENSOR_FFN_DOWN_SHEXP,
+    LLM_TENSOR_FFN_GATE_SHEXP,
+    LLM_TENSOR_FFN_UP_SHEXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,

@@ -700,6 +720,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
     { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
     { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
     },
 },
 {
@@ -735,6 +757,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
+{
+    LLM_ARCH_QWEN2MOE,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+        { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+        { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+        { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+    },
+},
 {
     LLM_ARCH_PHI2,
     {

@@ -751,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
+{
+    LLM_ARCH_PHI3,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+    },
+},
 {
     LLM_ARCH_PLAMO,
     {

@@ -934,6 +995,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
     },
 },
+{
+    LLM_ARCH_DBRX,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+    },
+},
+{
+    LLM_ARCH_OLMO,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+    },
+},
 {
     LLM_ARCH_UNKNOWN,
     {
@@ -1528,12 +1619,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
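The helper above now forwards a `special` flag to the C API, which controls whether special tokens are rendered as text. A minimal sketch of calling the updated C function directly, assuming only the five-argument signature shown in this hunk (the helper name `token_to_text` is hypothetical):

#include <string>
#include <vector>
#include "llama.h"

// Hypothetical helper: render one token to text; a negative return value from
// llama_token_to_piece reports the buffer size that is actually required.
static std::string token_to_text(const llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    if (n < 0) {
        buf.resize(-n);
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    }
    return std::string(buf.data(), n);
}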
@@ -1690,6 +1781,7 @@ enum e_model {
     MODEL_4B,
     MODEL_7B,
     MODEL_8B,
+    MODEL_12B,
     MODEL_13B,
     MODEL_14B,
     MODEL_15B,

@@ -1705,8 +1797,10 @@ enum e_model {
     MODEL_MEDIUM,
     MODEL_LARGE,
     MODEL_XL,
+    MODEL_A2_7B,
     MODEL_8x7B,
     MODEL_8x22B,
+    MODEL_16x12B,
 };

 static const size_t kiB = 1024;

@@ -1890,6 +1984,12 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_exps;
     struct ggml_tensor * ffn_up_exps ;

+    // ff shared expert (shexp)
+    struct ggml_tensor * ffn_gate_inp_shexp;
+    struct ggml_tensor * ffn_gate_shexp;
+    struct ggml_tensor * ffn_down_shexp;
+    struct ggml_tensor * ffn_up_shexp;
+
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3

@@ -2036,10 +2136,10 @@ struct llama_vocab {
     int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.

     id linefeed_id = 13;
-    id special_prefix_id =
-    id
-    id
-    id special_eot_id =
+    id special_prefix_id = -1;
+    id special_suffix_id = -1;
+    id special_middle_id = -1;
+    id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

     bool add_space_prefix = true;

@@ -2899,9 +2999,13 @@ struct llama_model_loader {

     ggml_tensor * tensor;

-    llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+    llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
         const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
         offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+        if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+            throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+        }
     }
 };
 std::vector<llama_tensor_weight> weights;

@@ -2940,15 +3044,15 @@ struct llama_model_loader {
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+    files.emplace_back(new llama_file(fname.c_str(), "rb"));
+    contexts.emplace_back(ctx);
+
     // Save tensors data offset of the main file.
     // For subsidiary files, `meta` tensor data offset must not be used,
     // so we build a unified tensors index for weights.
     for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-        weights.emplace_back(0, cur->name, meta, cur);
+        weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
     }
-    files.emplace_back(new llama_file(fname.c_str(), "rb"));
-    contexts.emplace_back(ctx);
-
     uint16_t n_split = 0;
     get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -2982,12 +3086,13 @@ struct llama_model_loader {
     throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
 }

+    files.emplace_back(new llama_file(split_path, "rb"));
+    contexts.emplace_back(ctx);
+
     // Save tensors data offset info of the shard.
     for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-        weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+        weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
     }
-    files.emplace_back(new llama_file(split_path, "rb"));
-    contexts.emplace_back(ctx);

     gguf_free(ctx_gguf);
 }

@@ -3197,6 +3302,10 @@ struct llama_model_loader {
     return nullptr;
 }

+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -3545,6 +3654,7 @@ static const char * llama_model_type_name(e_model type) {
     case MODEL_3B: return "3B";
     case MODEL_7B: return "7B";
     case MODEL_8B: return "8B";
+    case MODEL_12B: return "12B";
     case MODEL_13B: return "13B";
     case MODEL_14B: return "14B";
     case MODEL_15B: return "15B";

@@ -3560,8 +3670,10 @@ static const char * llama_model_type_name(e_model type) {
     case MODEL_MEDIUM: return "0.4B";
     case MODEL_LARGE: return "0.8B";
     case MODEL_XL: return "1.5B";
+    case MODEL_A2_7B: return "A2.7B";
     case MODEL_8x7B: return "8x7B";
     case MODEL_8x22B: return "8x22B";
+    case MODEL_16x12B: return "16x12B";
     default: return "?B";
     }
 }

@@ -3686,7 +3798,7 @@ static void llm_load_hparams(
     switch (hparams.n_layer) {
         case 22: model.type = e_model::MODEL_1B; break;
         case 26: model.type = e_model::MODEL_3B; break;
-        case 32: model.type = e_model::MODEL_7B; break;
+        case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
         case 40: model.type = e_model::MODEL_13B; break;
         case 48: model.type = e_model::MODEL_34B; break;
         case 60: model.type = e_model::MODEL_30B; break;

@@ -3834,6 +3946,7 @@ static void llm_load_hparams(
     switch (hparams.n_layer) {
         case 24: model.type = e_model::MODEL_1B; break;
         case 32: model.type = e_model::MODEL_3B; break;
+        case 40: model.type = e_model::MODEL_12B; break;
         default: model.type = e_model::MODEL_UNKNOWN;
     }
 } break;
@@ -3858,10 +3971,28 @@ static void llm_load_hparams(
         default: model.type = e_model::MODEL_UNKNOWN;
     }
 } break;
+case LLM_ARCH_QWEN2MOE:
+    {
+        ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+        switch (hparams.n_layer) {
+            case 24: model.type = e_model::MODEL_A2_7B; break;
+            default: model.type = e_model::MODEL_UNKNOWN;
+        }
+    } break;
 case LLM_ARCH_PHI2:
     {
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+        switch (hparams.n_layer) {
+            case 24: model.type = e_model::MODEL_1B; break;
+            case 32: model.type = e_model::MODEL_3B; break;
+            default: model.type = e_model::MODEL_UNKNOWN;
+        }
+    } break;
+case LLM_ARCH_PHI3:
+    {
+        ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
         switch (hparams.n_layer) {
             case 24: model.type = e_model::MODEL_1B; break;
             case 32: model.type = e_model::MODEL_3B; break;

@@ -3983,6 +4114,28 @@ static void llm_load_hparams(
         default: model.type = e_model::MODEL_UNKNOWN;
     }
 } break;
+case LLM_ARCH_DBRX:
+    {
+        ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+        ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+        switch (hparams.n_layer) {
+            case 40: model.type = e_model::MODEL_16x12B; break;
+            default: model.type = e_model::MODEL_UNKNOWN;
+        }
+    } break;
+case LLM_ARCH_OLMO:
+    {
+        ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+        ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+        switch (hparams.n_layer) {
+            case 22: model.type = e_model::MODEL_1B; break;
+            case 32: model.type = e_model::MODEL_7B; break;
+            case 80: model.type = e_model::MODEL_70B; break;
+            default: model.type = e_model::MODEL_UNKNOWN;
+        }
+    } break;
 default: (void)0;
 }

@@ -4042,6 +4195,35 @@ static void llm_load_vocab(
     vocab.special_cls_id = -1;
     vocab.special_mask_id = -1;

+    // For Fill-In-the-Middle (FIM)/infill models which where converted
+    // prior to support of FIM special tokens in GGUF, the following
+    // will allow those models to continue to work. The general names
+    // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+    // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+    // new versions of these models have been published.
+    std::string gen_name;
+    ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+    std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+        [](unsigned char c){ return std::tolower(c); });
+
+    if (gen_name.find("code") != std::string::npos) {
+        if (model.arch == LLM_ARCH_LLAMA) {
+            vocab.special_prefix_id = 32007;
+            vocab.special_suffix_id = 32008;
+            vocab.special_middle_id = 32009;
+            vocab.special_eot_id = 32010;
+        } else if (model.arch == LLM_ARCH_GEMMA) {
+            vocab.special_prefix_id = 67;
+            vocab.special_suffix_id = 69;
+            vocab.special_middle_id = 68;
+            // TODO: this is not EOT, it is "file separator" token, needs fix
+            // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+            //vocab.special_eot_id = 70;
+            vocab.special_eot_id = 107;
+        }
+    }
+
     const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
     if (add_space_prefix_keyidx != -1) {
         vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
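The hard-coded IDs above only cover models converted before the FIM key/value pairs existed; once set, the four vocab fields drive infill prompting. A hypothetical sketch of the usual CodeLlama-style token ordering (the helper and the exact ordering convention are assumptions, not part of this diff):

#include <vector>
#include "llama.h"

// Hypothetical helper: lay out an infill request as <PRE> prefix <SUF> suffix <MID>,
// using the special token ids resolved by llm_load_vocab above.
static std::vector<llama_token> build_infill_tokens(
        llama_token pre_id, llama_token suf_id, llama_token mid_id,
        const std::vector<llama_token> & prefix,
        const std::vector<llama_token> & suffix) {
    std::vector<llama_token> out;
    out.push_back(pre_id);                                // <PRE>
    out.insert(out.end(), prefix.begin(), prefix.end());  // code before the cursor
    out.push_back(suf_id);                                // <SUF>
    out.insert(out.end(), suffix.begin(), suffix.end());  // code after the cursor
    out.push_back(mid_id);                                // <MID> - generation continues from here
    return out;
}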
@@ -4155,14 +4337,19 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,
-            { LLM_KV_TOKENIZER_EOS_ID,
-            { LLM_KV_TOKENIZER_UNK_ID,
-            { LLM_KV_TOKENIZER_SEP_ID,
-            { LLM_KV_TOKENIZER_PAD_ID,
-            { LLM_KV_TOKENIZER_CLS_ID,
-            { LLM_KV_TOKENIZER_MASK_ID,
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+            { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+            { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+            { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
         };
+
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
             int32_t & id = std::get<1>(it);

@@ -4177,7 +4364,6 @@ static void llm_load_vocab(
     } else {
         id = new_id;
     }
-
 }

 // Handle add_bos_token and add_eos_token

@@ -4191,6 +4377,28 @@ static void llm_load_vocab(
         vocab.special_add_eos = int(temp);
     }
 }
+
+// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+//
+// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+//       for now, we apply this workaround to find the EOT token based on its text
+if (vocab.special_eot_id == -1) {
+    for (const auto & t : vocab.token_to_id) {
+        if (
+            // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+            //       need to fix convert script
+            //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+            (t.first == "<|eot_id|>" ||
+             t.first == "<|im_end|>" ||
+             t.first == "<|end|>" ||
+             t.first == "<end_of_turn>"
+            )
+        ) {
+            vocab.special_eot_id = t.second;
+            break;
+        }
+    }
+}
 }

 // build special tokens cache
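With an EOT id recovered, a generation loop will typically treat it like EOS when deciding to stop. A small sketch, assuming the public accessors llama_token_eos() and llama_token_eot() (they are not part of this hunk):

#include "llama.h"

// Sketch: stop generation on either the end-of-sequence or the end-of-turn token.
static bool is_stop_token(const llama_model * model, llama_token id) {
    return id == llama_token_eos(model) || id == llama_token_eot(model);
}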
@@ -4353,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

     // special tokens
-    if (vocab.special_bos_id
-    if (vocab.special_eos_id
-    if (vocab.special_unk_id
-    if (vocab.special_sep_id
-    if (vocab.special_pad_id
-    if (vocab.special_cls_id
-    if (vocab.special_mask_id
-
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
 }

 // Returns false if cancelled by progress_callback

@@ -4378,6 +4591,13 @@ static bool llm_load_tensors(

     auto & hparams = model.hparams;

+#ifdef GGML_USE_SYCL
+    // disable MoE with SYCL until mul_mat_id is updated
+    if (hparams.n_expert > 0) {
+        n_gpu_layers = 0;
+    }
+#endif
+
     model.split_mode = split_mode;
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;

@@ -4475,7 +4695,7 @@ static bool llm_load_tensors(
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output

     // for moe merged tensors
-    ctx_size += ggml_tensor_overhead()*
+    ctx_size += ggml_tensor_overhead()*n_layer*3;

     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
@@ -4671,6 +4891,39 @@ static bool llm_load_tensors(
|
|
4671
4891
|
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
4672
4892
|
}
|
4673
4893
|
} break;
|
4894
|
+
case LLM_ARCH_DBRX:
|
4895
|
+
{
|
4896
|
+
if (n_expert == 0) {
|
4897
|
+
throw std::runtime_error("DBRX model cannot have zero experts");
|
4898
|
+
}
|
4899
|
+
|
4900
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4901
|
+
|
4902
|
+
// output
|
4903
|
+
{
|
4904
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4905
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
4906
|
+
}
|
4907
|
+
|
4908
|
+
for (int i = 0; i < n_layer; ++i) {
|
4909
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4910
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4911
|
+
|
4912
|
+
auto & layer = model.layers[i];
|
4913
|
+
|
4914
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4915
|
+
|
4916
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4917
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4918
|
+
|
4919
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
4920
|
+
|
4921
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4922
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4923
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
|
4924
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4925
|
+
}
|
4926
|
+
} break;
|
4674
4927
|
case LLM_ARCH_BAICHUAN:
|
4675
4928
|
{
|
4676
4929
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -4985,8 +5238,13 @@ static bool llm_load_tensors(
     layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
     layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);

-
-    layer.
+    // optional q and k layernorms, present in StableLM 2 12B
+    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
+    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+    // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);

     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});

@@ -5029,7 +5287,13 @@ static bool llm_load_tensors(
     // output
     {
         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+        // if output is NULL, init from the input tok embed
+        if (model.output == NULL) {
+            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+            ml.n_created--; // artificial tensor
+            ml.size_data += ggml_nbytes(model.output);
+        }
     }

     for (int i = 0; i < n_layer; ++i) {
@@ -5057,6 +5321,54 @@ static bool llm_load_tensors(
|
|
5057
5321
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5058
5322
|
}
|
5059
5323
|
} break;
|
5324
|
+
case LLM_ARCH_QWEN2MOE:
|
5325
|
+
{
|
5326
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5327
|
+
|
5328
|
+
// output
|
5329
|
+
{
|
5330
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5331
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5332
|
+
}
|
5333
|
+
|
5334
|
+
for (int i = 0; i < n_layer; ++i) {
|
5335
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5336
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5337
|
+
|
5338
|
+
auto & layer = model.layers[i];
|
5339
|
+
|
5340
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5341
|
+
|
5342
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5343
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5344
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5345
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5346
|
+
|
5347
|
+
// optional bias tensors
|
5348
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
5349
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
5350
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
5351
|
+
|
5352
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5353
|
+
|
5354
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
5355
|
+
|
5356
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
5357
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
5358
|
+
|
5359
|
+
// MoE branch
|
5360
|
+
auto n_ff_exp = n_ff / hparams.n_expert_used;
|
5361
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
5362
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
5363
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
5364
|
+
|
5365
|
+
// Shared expert branch
|
5366
|
+
layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
|
5367
|
+
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
|
5368
|
+
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
|
5369
|
+
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
|
5370
|
+
}
|
5371
|
+
} break;
|
5060
5372
|
case LLM_ARCH_PHI2:
|
5061
5373
|
{
|
5062
5374
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -5102,6 +5414,33 @@ static bool llm_load_tensors(
|
|
5102
5414
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
5103
5415
|
}
|
5104
5416
|
} break;
|
5417
|
+
case LLM_ARCH_PHI3:
|
5418
|
+
{
|
5419
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
|
5420
|
+
|
5421
|
+
// output
|
5422
|
+
{
|
5423
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
|
5424
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
|
5425
|
+
}
|
5426
|
+
|
5427
|
+
for (int i = 0; i < n_layer; ++i) {
|
5428
|
+
ggml_context* ctx_layer = ctx_for_layer(i);
|
5429
|
+
ggml_context* ctx_split = ctx_for_layer_split(i);
|
5430
|
+
|
5431
|
+
auto& layer = model.layers[i];
|
5432
|
+
|
5433
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
5434
|
+
|
5435
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
|
5436
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
5437
|
+
|
5438
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
5439
|
+
|
5440
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
5441
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
5442
|
+
}
|
5443
|
+
} break;
|
5105
5444
|
case LLM_ARCH_PLAMO:
|
5106
5445
|
{
|
5107
5446
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -5450,6 +5789,37 @@ static bool llm_load_tensors(
|
|
5450
5789
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5451
5790
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5452
5791
|
|
5792
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5793
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5794
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5795
|
+
}
|
5796
|
+
} break;
|
5797
|
+
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
|
5798
|
+
{
|
5799
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5800
|
+
|
5801
|
+
// output
|
5802
|
+
{
|
5803
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
5804
|
+
// if output is NULL, init from the input tok embed
|
5805
|
+
if (model.output == NULL) {
|
5806
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5807
|
+
ml.n_created--; // artificial tensor
|
5808
|
+
ml.size_data += ggml_nbytes(model.output);
|
5809
|
+
}
|
5810
|
+
}
|
5811
|
+
|
5812
|
+
for (int i = 0; i < n_layer; ++i) {
|
5813
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5814
|
+
|
5815
|
+
auto & layer = model.layers[i];
|
5816
|
+
|
5817
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5818
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5819
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5820
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5821
|
+
|
5822
|
+
|
5453
5823
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5454
5824
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5455
5825
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
@@ -5890,6 +6260,100 @@ static struct ggml_tensor * llm_build_ffn(
|
|
5890
6260
|
return cur;
|
5891
6261
|
}
|
5892
6262
|
|
6263
|
+
static struct ggml_tensor * llm_build_moe_ffn(
|
6264
|
+
struct ggml_context * ctx,
|
6265
|
+
struct ggml_tensor * cur,
|
6266
|
+
struct ggml_tensor * gate_inp,
|
6267
|
+
struct ggml_tensor * up_exps,
|
6268
|
+
struct ggml_tensor * gate_exps,
|
6269
|
+
struct ggml_tensor * down_exps,
|
6270
|
+
int64_t n_expert,
|
6271
|
+
int64_t n_expert_used,
|
6272
|
+
llm_ffn_op_type type_op,
|
6273
|
+
bool norm_w,
|
6274
|
+
const llm_build_cb & cb,
|
6275
|
+
int il) {
|
6276
|
+
int64_t n_embd = cur->ne[0];
|
6277
|
+
int64_t n_tokens = cur->ne[1];
|
6278
|
+
|
6279
|
+
ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
|
6280
|
+
cb(logits, "ffn_moe_logits", il);
|
6281
|
+
|
6282
|
+
ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
|
6283
|
+
cb(probs, "ffn_moe_probs", il);
|
6284
|
+
|
6285
|
+
// select experts
|
6286
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
|
6287
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6288
|
+
cb(selected_experts, "ffn_moe_topk", il);
|
6289
|
+
|
6290
|
+
ggml_tensor * weights = ggml_get_rows(ctx,
|
6291
|
+
ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
6292
|
+
cb(weights, "ffn_moe_weights", il);
|
6293
|
+
|
6294
|
+
if (norm_w) {
|
6295
|
+
weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
|
6296
|
+
|
6297
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
|
6298
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6299
|
+
|
6300
|
+
weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
|
6301
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6302
|
+
|
6303
|
+
weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
|
6304
|
+
}
|
6305
|
+
|
6306
|
+
cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
6307
|
+
ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
6308
|
+
cb(up, "ffn_moe_up", il);
|
6309
|
+
|
6310
|
+
ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
6311
|
+
cb(gate, "ffn_moe_gate", il);
|
6312
|
+
|
6313
|
+
switch (type_op) {
|
6314
|
+
case LLM_FFN_SILU:
|
6315
|
+
{
|
6316
|
+
gate = ggml_silu(ctx, gate);
|
6317
|
+
cb(gate, "ffn_moe_silu", il);
|
6318
|
+
} break;
|
6319
|
+
case LLM_FFN_GELU:
|
6320
|
+
{
|
6321
|
+
gate = ggml_gelu(ctx, gate);
|
6322
|
+
cb(gate, "ffn_moe_gelu", il);
|
6323
|
+
} break;
|
6324
|
+
default:
|
6325
|
+
GGML_ASSERT(false);
|
6326
|
+
}
|
6327
|
+
|
6328
|
+
ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
|
6329
|
+
cb(par, "ffn_moe_gate_par", il);
|
6330
|
+
|
6331
|
+
ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
6332
|
+
cb(experts, "ffn_moe_down", il);
|
6333
|
+
|
6334
|
+
experts = ggml_mul(ctx, experts, weights);
|
6335
|
+
|
6336
|
+
// aggregate experts
|
6337
|
+
ggml_tensor * moe_out = nullptr;
|
6338
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6339
|
+
ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
|
6340
|
+
experts->nb[2], i*experts->nb[1]);
|
6341
|
+
|
6342
|
+
if (i == 0) {
|
6343
|
+
moe_out = cur_expert;
|
6344
|
+
} else {
|
6345
|
+
moe_out = ggml_add(ctx, moe_out, cur_expert);
|
6346
|
+
}
|
6347
|
+
}
|
6348
|
+
|
6349
|
+
if (n_expert_used == 1) {
|
6350
|
+
// avoid returning a non-contiguous tensor
|
6351
|
+
moe_out = ggml_cont(ctx, moe_out);
|
6352
|
+
}
|
6353
|
+
|
6354
|
+
return moe_out;
|
6355
|
+
}
|
6356
|
+
|
5893
6357
|
// if max_alibi_bias > 0 then apply ALiBi
|
5894
6358
|
static struct ggml_tensor * llm_build_kqv(
|
5895
6359
|
struct ggml_context * ctx,
|
@@ -5928,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);

-    if (model.arch == LLM_ARCH_PHI2) {
+    if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
         // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
         // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6433,63 +6897,16 @@ struct llm_build_context {
|
|
6433
6897
|
LLM_NORM_RMS, cb, il);
|
6434
6898
|
cb(cur, "ffn_norm", il);
|
6435
6899
|
|
6436
|
-
|
6437
|
-
|
6438
|
-
|
6439
|
-
|
6440
|
-
|
6441
|
-
|
6442
|
-
|
6443
|
-
|
6444
|
-
cb(
|
6445
|
-
|
6446
|
-
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6447
|
-
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6448
|
-
cb(weights, "ffn_moe_weights", il);
|
6449
|
-
|
6450
|
-
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6451
|
-
|
6452
|
-
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6453
|
-
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6454
|
-
|
6455
|
-
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6456
|
-
cb(weights, "ffn_moe_weights_norm", il);
|
6457
|
-
|
6458
|
-
// compute expert outputs
|
6459
|
-
ggml_tensor * moe_out = nullptr;
|
6460
|
-
|
6461
|
-
for (int i = 0; i < n_expert_used; ++i) {
|
6462
|
-
ggml_tensor * cur_expert;
|
6463
|
-
|
6464
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6465
|
-
cb(cur_up, "ffn_moe_up", il);
|
6466
|
-
|
6467
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6468
|
-
cb(cur_gate, "ffn_moe_gate", il);
|
6469
|
-
|
6470
|
-
cur_gate = ggml_silu(ctx0, cur_gate);
|
6471
|
-
cb(cur_gate, "ffn_moe_silu", il);
|
6472
|
-
|
6473
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6474
|
-
cb(cur_expert, "ffn_moe_gate_par", il);
|
6475
|
-
|
6476
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6477
|
-
cb(cur_expert, "ffn_moe_down", il);
|
6478
|
-
|
6479
|
-
cur_expert = ggml_mul(ctx0, cur_expert,
|
6480
|
-
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6481
|
-
cb(cur_expert, "ffn_moe_weighted", il);
|
6482
|
-
|
6483
|
-
if (i == 0) {
|
6484
|
-
moe_out = cur_expert;
|
6485
|
-
} else {
|
6486
|
-
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6487
|
-
cb(moe_out, "ffn_moe_out", il);
|
6488
|
-
}
|
6489
|
-
}
|
6490
|
-
|
6491
|
-
cur = moe_out;
|
6492
|
-
}
|
6900
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
6901
|
+
model.layers[il].ffn_gate_inp,
|
6902
|
+
model.layers[il].ffn_up_exps,
|
6903
|
+
model.layers[il].ffn_gate_exps,
|
6904
|
+
model.layers[il].ffn_down_exps,
|
6905
|
+
n_expert, n_expert_used,
|
6906
|
+
LLM_FFN_SILU, true,
|
6907
|
+
cb, il);
|
6908
|
+
cb(cur, "ffn_moe_out", il);
|
6909
|
+
}
|
6493
6910
|
|
6494
6911
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
6495
6912
|
cb(cur, "ffn_out", il);
|
@@ -6967,74 +7384,158 @@ struct llm_build_context {
|
|
6967
7384
|
LLM_NORM_RMS, cb, il);
|
6968
7385
|
cb(cur, "ffn_norm", il);
|
6969
7386
|
|
6970
|
-
|
6971
|
-
|
7387
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
7388
|
+
model.layers[il].ffn_gate_inp,
|
7389
|
+
model.layers[il].ffn_up_exps,
|
7390
|
+
model.layers[il].ffn_gate_exps,
|
7391
|
+
model.layers[il].ffn_down_exps,
|
7392
|
+
n_expert, n_expert_used,
|
7393
|
+
LLM_FFN_GELU, true,
|
7394
|
+
cb, il);
|
7395
|
+
cb(cur, "ffn_moe_out", il);
|
7396
|
+
|
7397
|
+
// Grok
|
7398
|
+
// if layer_out_norm is present then apply it before adding the input
|
7399
|
+
// Idea: maybe ffn_out_norm is a better name
|
7400
|
+
if (model.layers[il].layer_out_norm) {
|
7401
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7402
|
+
model.layers[il].layer_out_norm, NULL,
|
7403
|
+
LLM_NORM_RMS, cb, il);
|
7404
|
+
cb(cur, "layer_out_norm", il);
|
7405
|
+
}
|
7406
|
+
|
7407
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
7408
|
+
cb(cur, "ffn_out", il);
|
7409
|
+
|
7410
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7411
|
+
if (layer_dir != nullptr) {
|
7412
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7413
|
+
}
|
7414
|
+
cb(cur, "l_out", il);
|
7415
|
+
|
7416
|
+
// input for next layer
|
7417
|
+
inpL = cur;
|
7418
|
+
}
|
7419
|
+
|
7420
|
+
cur = inpL;
|
7421
|
+
|
7422
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7423
|
+
model.output_norm, NULL,
|
7424
|
+
LLM_NORM_RMS, cb, -1);
|
7425
|
+
cb(cur, "result_norm", -1);
|
7426
|
+
|
7427
|
+
// lm_head
|
7428
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7429
|
+
|
7430
|
+
// Grok
|
7431
|
+
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
6972
7432
|
|
6973
|
-
|
6974
|
-
cb(probs, "ffn_moe_probs", il);
|
7433
|
+
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
6975
7434
|
|
6976
|
-
|
6977
|
-
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
6978
|
-
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
7435
|
+
cb(cur, "result_output", -1);
|
6979
7436
|
|
6980
|
-
|
6981
|
-
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6982
|
-
cb(weights, "ffn_moe_weights", il);
|
7437
|
+
ggml_build_forward_expand(gf, cur);
|
6983
7438
|
|
6984
|
-
|
7439
|
+
return gf;
|
7440
|
+
}
|
7441
|
+
|
7442
|
+
struct ggml_cgraph * build_dbrx() {
|
7443
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6985
7444
|
|
6986
|
-
|
6987
|
-
|
7445
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
7446
|
+
int32_t n_tokens = this->n_tokens;
|
6988
7447
|
|
6989
|
-
|
6990
|
-
|
7448
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7449
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
7450
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7451
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6991
7452
|
|
6992
|
-
|
6993
|
-
|
7453
|
+
struct ggml_tensor * cur;
|
7454
|
+
struct ggml_tensor * inpL;
|
6994
7455
|
|
6995
|
-
|
6996
|
-
ggml_tensor * cur_expert;
|
7456
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6997
7457
|
|
6998
|
-
|
6999
|
-
|
7458
|
+
// inp_pos - contains the positions
|
7459
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7000
7460
|
|
7001
|
-
|
7002
|
-
|
7461
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7462
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7003
7463
|
|
7004
|
-
|
7005
|
-
|
7006
|
-
cb(cur_gate, "ffn_moe_gelu", il);
|
7464
|
+
for (int il = 0; il < n_layer; ++il) {
|
7465
|
+
struct ggml_tensor * inpSA = inpL;
|
7007
7466
|
|
7008
|
-
|
7009
|
-
|
7467
|
+
// norm
|
7468
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7469
|
+
model.layers[il].attn_norm, NULL,
|
7470
|
+
LLM_NORM, cb, il);
|
7471
|
+
cb(cur, "attn_norm", il);
|
7010
7472
|
|
7011
|
-
|
7012
|
-
|
7473
|
+
// self-attention
|
7474
|
+
{
|
7475
|
+
struct ggml_tensor * Qcur = nullptr;
|
7476
|
+
struct ggml_tensor * Kcur = nullptr;
|
7477
|
+
struct ggml_tensor * Vcur = nullptr;
|
7013
7478
|
|
7014
|
-
|
7015
|
-
|
7016
|
-
cb(cur_expert, "ffn_moe_weighted", il);
|
7479
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
7480
|
+
cb(cur, "wqkv", il);
|
7017
7481
|
|
7018
|
-
|
7019
|
-
|
7020
|
-
} else {
|
7021
|
-
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
7022
|
-
cb(moe_out, "ffn_moe_out", il);
|
7023
|
-
}
|
7024
|
-
}
|
7482
|
+
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
7483
|
+
cb(cur, "wqkv_clamped", il);
|
7025
7484
|
|
7026
|
-
|
7485
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
7486
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
7487
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
7027
7488
|
|
7028
|
-
|
7029
|
-
|
7030
|
-
|
7031
|
-
|
7032
|
-
|
7033
|
-
|
7034
|
-
|
7035
|
-
|
7489
|
+
cb(Qcur, "Qcur", il);
|
7490
|
+
cb(Kcur, "Kcur", il);
|
7491
|
+
cb(Vcur, "Vcur", il);
|
7492
|
+
|
7493
|
+
Qcur = ggml_rope_custom(
|
7494
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7495
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7496
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7497
|
+
);
|
7498
|
+
cb(Qcur, "Qcur", il);
|
7499
|
+
|
7500
|
+
Kcur = ggml_rope_custom(
|
7501
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7502
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7503
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7504
|
+
);
|
7505
|
+
cb(Kcur, "Kcur", il);
|
7506
|
+
|
7507
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7508
|
+
model.layers[il].wo, NULL,
|
7509
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7036
7510
|
}
|
7037
7511
|
|
7512
|
+
if (il == n_layer - 1) {
|
7513
|
+
// skip computing output for unused tokens
|
7514
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7515
|
+
n_tokens = n_outputs;
|
7516
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7517
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7518
|
+
}
|
7519
|
+
|
7520
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7521
|
+
cb(ffn_inp, "ffn_inp", il);
|
7522
|
+
|
7523
|
+
// feed-forward network
|
7524
|
+
// MoE branch
|
7525
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
7526
|
+
model.layers[il].attn_out_norm, NULL,
|
7527
|
+
LLM_NORM, cb, il);
|
7528
|
+
cb(cur, "attn_out_norm", il);
|
7529
|
+
|
7530
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
7531
|
+
model.layers[il].ffn_gate_inp,
|
7532
|
+
model.layers[il].ffn_up_exps,
|
7533
|
+
model.layers[il].ffn_gate_exps,
|
7534
|
+
model.layers[il].ffn_down_exps,
|
7535
|
+
n_expert, n_expert_used,
|
7536
|
+
LLM_FFN_SILU, true,
|
7537
|
+
cb, il);
|
7538
|
+
cb(cur, "ffn_moe_out", il);
|
7038
7539
|
|
7039
7540
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
7040
7541
|
cb(cur, "ffn_out", il);
|
@@ -7052,18 +7553,13 @@ struct llm_build_context {
|
|
7052
7553
|
cur = inpL;
|
7053
7554
|
|
7054
7555
|
cur = llm_build_norm(ctx0, cur, hparams,
|
7055
|
-
|
7056
|
-
|
7556
|
+
model.output_norm, NULL,
|
7557
|
+
LLM_NORM, cb, -1);
|
7057
7558
|
cb(cur, "result_norm", -1);
|
7058
7559
|
|
7059
7560
|
// lm_head
|
7060
7561
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7061
7562
|
|
7062
|
-
// Grok
|
7063
|
-
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7064
|
-
|
7065
|
-
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7066
|
-
|
7067
7563
|
cb(cur, "result_output", -1);
|
7068
7564
|
|
7069
7565
|
ggml_build_forward_expand(gf, cur);
|
@@ -7923,7 +8419,7 @@ struct llm_build_context {
|
|
7923
8419
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7924
8420
|
|
7925
8421
|
for (int il = 0; il < n_layer; ++il) {
|
7926
|
-
|
8422
|
+
|
7927
8423
|
|
7928
8424
|
// norm
|
7929
8425
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
@@ -7932,6 +8428,8 @@ struct llm_build_context {
|
|
7932
8428
|
LLM_NORM, cb, il);
|
7933
8429
|
cb(cur, "attn_norm", il);
|
7934
8430
|
|
8431
|
+
struct ggml_tensor * inpSA = cur;
|
8432
|
+
|
7935
8433
|
// self-attention
|
7936
8434
|
{
|
7937
8435
|
// compute Q and K and RoPE them
|
@@ -7956,15 +8454,36 @@ struct llm_build_context {
|
|
7956
8454
|
cb(Vcur, "Vcur", il);
|
7957
8455
|
}
|
7958
8456
|
|
8457
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8458
|
+
cb(Qcur, "Qcur", il);
|
8459
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8460
|
+
cb(Kcur, "Kcur", il);
|
8461
|
+
|
8462
|
+
if (model.layers[il].attn_q_norm) {
|
8463
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
8464
|
+
model.layers[il].attn_q_norm,
|
8465
|
+
NULL,
|
8466
|
+
LLM_NORM, cb, il);
|
8467
|
+
cb(Qcur, "Qcur", il);
|
8468
|
+
}
|
8469
|
+
if (model.layers[il].attn_k_norm) {
|
8470
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
8471
|
+
model.layers[il].attn_k_norm,
|
8472
|
+
NULL,
|
8473
|
+
LLM_NORM, cb, il);
|
8474
|
+
cb(Kcur, "Kcur", il);
|
8475
|
+
}
|
8476
|
+
|
8477
|
+
|
7959
8478
|
Qcur = ggml_rope_custom(
|
7960
|
-
ctx0,
|
8479
|
+
ctx0, Qcur, inp_pos,
|
7961
8480
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7962
8481
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7963
8482
|
);
|
7964
8483
|
cb(Qcur, "Qcur", il);
|
7965
8484
|
|
7966
8485
|
Kcur = ggml_rope_custom(
|
7967
|
-
ctx0,
|
8486
|
+
ctx0, Kcur, inp_pos,
|
7968
8487
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7969
8488
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7970
8489
|
);
|
@@ -7979,20 +8498,25 @@ struct llm_build_context {
 // skip computing output for unused tokens
 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
 }
 
-struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur,
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
 cb(ffn_inp, "ffn_inp", il);
 
 // feed-forward network
 {
-
-
-
-
-
-
+if (model.layers[il].ffn_norm) {
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm,
+model.layers[il].ffn_norm_b,
+LLM_NORM, cb, il);
+cb(cur, "ffn_norm", il);
+} else {
+// parallel residual
+cur = inpSA;
+}
 cur = llm_build_ffn(ctx0, cur,
 model.layers[il].ffn_up, NULL,
 model.layers[il].ffn_gate, NULL,
@@ -8182,12 +8706,6 @@ struct llm_build_context {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 cb(Vcur, "Vcur", il);
 
-// these nodes are added to the graph together so that they are not reordered
-// by doing so, the number of splits in the graph is reduced
-ggml_build_forward_expand(gf, Qcur);
-ggml_build_forward_expand(gf, Kcur);
-ggml_build_forward_expand(gf, Vcur);
-
 Qcur = ggml_rope_custom(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8245,25 +8763,288 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, -1);
 cb(cur, "result_norm", -1);
 
-// lm_head
+// lm_head
+cur = ggml_mul_mat(ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
+
+struct ggml_cgraph * build_qwen2moe() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+// mutable variable, needed during the last layer of the computation to skip unused tokens
+int32_t n_tokens = this->n_tokens;
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = build_inp_pos();
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "attn_norm", il);
+
+// self_attention
+{
+// compute Q and K and RoPE them
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+cb(Qcur, "Qcur", il);
+
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+cb(Kcur, "Kcur", il);
+
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+cb(Vcur, "Vcur", il);
+
+Qcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+Kcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+model.layers[il].wo, model.layers[il].bo,
+Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+n_tokens = n_outputs;
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+// MoE branch
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
+ggml_tensor * moe_out =
+llm_build_moe_ffn(ctx0, cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+n_expert, n_expert_used,
+LLM_FFN_SILU, false,
+cb, il);
+cb(cur, "ffn_moe_out", il);
+
+// FFN shared expert
+{
+ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
+cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
+
+// sigmoid
+ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
+cb(cur_gate, "ffn_shexp_gate", il);
+
+ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
+model.layers[il].ffn_up_shexp, NULL,
+model.layers[il].ffn_gate_shexp, NULL,
+model.layers[il].ffn_down_shexp, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(cur_ffn, "ffn_shexp", il);
+
+ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
+cb(ffn_shexp_out, "ffn_shexp_out", il);
+
+moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
+cb(moe_out, "ffn_out", il);
+
+cur = moe_out;
+}
+
+cur = ggml_add(ctx0, cur, ffn_inp);
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.output_norm, NULL,
+LLM_NORM_RMS, cb, -1);
+cb(cur, "result_norm", -1);
+
+// lm_head
+cur = ggml_mul_mat(ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
+
+struct ggml_cgraph * build_phi2() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * attn_norm_output;
+struct ggml_tensor * ffn_output;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = build_inp_pos();
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+for (int il = 0; il < n_layer; ++il) {
+attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm,
+model.layers[il].attn_norm_b,
+LLM_NORM, cb, il);
+cb(attn_norm_output, "attn_norm", il);
+
+// self-attention
+{
+struct ggml_tensor * Qcur = nullptr;
+struct ggml_tensor * Kcur = nullptr;
+struct ggml_tensor * Vcur = nullptr;
+
+if (model.layers[il].wqkv) {
+cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+cb(cur, "wqkv", il);
+
+cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+cb(cur, "bqkv", il);
+
+Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+} else {
+Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+}
+
+cb(Qcur, "Qcur", il);
+cb(Kcur, "Kcur", il);
+cb(Vcur, "Vcur", il);
+
+Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+Qcur = ggml_rope_custom(
+ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+// with phi2, we scale the Q to avoid precision issues
+// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
+cb(Qcur, "Qcur", il);
+
+Kcur = ggml_rope_custom(
+ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+model.layers[il].wo, model.layers[il].bo,
+Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+}
+
+// FF
+{
+ffn_output = llm_build_ffn(ctx0, attn_norm_output,
+model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+NULL, NULL,
+model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+NULL,
+LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+cb(ffn_output, "ffn_out", il);
+}
+
+cur = ggml_add(ctx0, cur, ffn_output);
+cb(cur, "l_out", il);
+
+cur = ggml_add(ctx0, cur, inpL);
+cb(cur, "l_out", il);
+
+inpL = cur;
+}
+
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.output_norm,
+model.output_norm_b,
+LLM_NORM, cb, -1);
+cb(cur, "result_norm", -1);
+
 cur = ggml_mul_mat(ctx0, model.output, cur);
-cb(cur, "
+cb(cur, "result_output_no_bias", -1);
 
+cur = ggml_add(ctx0, cur, model.output_b);
+cb(cur, "result_output", -1);
 ggml_build_forward_expand(gf, cur);
-
 return gf;
 }
 
-struct ggml_cgraph *
+struct ggml_cgraph * build_phi3() {
 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
 const int64_t n_embd_head = hparams.n_embd_head_v;
-const int64_t n_embd_gqa
+const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
 struct ggml_tensor * cur;
-struct ggml_tensor * attn_norm_output;
-struct ggml_tensor * ffn_output;
 struct ggml_tensor * inpL;
 
 inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8275,14 +9056,16 @@ struct llm_build_context {
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
 for (int il = 0; il < n_layer; ++il) {
-
-model.layers[il].attn_norm,
-model.layers[il].attn_norm_b,
-LLM_NORM, cb, il);
-cb(attn_norm_output, "attn_norm", il);
+auto residual = inpL;
 
 // self-attention
 {
+struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm,
+NULL,
+LLM_NORM_RMS, cb, il);
+cb(attn_norm_output, "attn_norm", il);
+
 struct ggml_tensor * Qcur = nullptr;
 struct ggml_tensor * Kcur = nullptr;
 struct ggml_tensor * Vcur = nullptr;
@@ -8291,13 +9074,11 @@ struct llm_build_context {
 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
 cb(cur, "wqkv", il);
 
-
-
-
-
-
-Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-} else {
+Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+}
+else {
 Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
 Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
 Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
@@ -8316,9 +9097,7 @@ struct llm_build_context {
 );
 cb(Qcur, "Qcur", il);
 
-
-// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
-Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
+Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_custom(
@@ -8328,48 +9107,58 @@ struct llm_build_context {
 cb(Kcur, "Kcur", il);
 
 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+model.layers[il].wo, NULL,
+Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }
 
 if (il == n_layer - 1) {
 // skip computing output for unused tokens
-struct ggml_tensor
-cur
-
-attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+residual = ggml_get_rows(ctx0, residual, inp_out_ids);
 }
 
+cur = ggml_add(ctx0, cur, residual);
+residual = cur;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
 // FF
+// special-case: the up and gate tensors are merged into a single tensor
+// TOOD: support into llm_build_ffn
 {
-
-
-NULL, NULL,
-model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-NULL,
-LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-cb(ffn_output, "ffn_out", il);
-}
+struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+cb(up, "ffn_up", il);
 
-
-
+auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
+auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
 
-
+y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
+cb(y, "ffn_gate", il);
+
+auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
+cb(down, "ffn_down", il);
+
+cur = down;
+cb(cur, "ffn_out", il);
+}
+
+cur = ggml_add(ctx0, residual, cur);
 cb(cur, "l_out", il);
 
 inpL = cur;
 }
 
 cur = llm_build_norm(ctx0, inpL, hparams,
-
-
-
+model.output_norm,
+NULL,
+LLM_NORM_RMS, cb, -1);
 cb(cur, "result_norm", -1);
 
 cur = ggml_mul_mat(ctx0, model.output, cur);
-cb(cur, "result_output_no_bias", -1);
-
-cur = ggml_add(ctx0, cur, model.output_b);
 cb(cur, "result_output", -1);
 
 ggml_build_forward_expand(gf, cur);
@@ -8377,6 +9166,7 @@ struct llm_build_context {
 return gf;
 }
 
+
 struct ggml_cgraph * build_plamo() {
 struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
@@ -9588,6 +10378,139 @@ struct llm_build_context {
 return gf;
 
 }
+
+// ref: https://allenai.org/olmo
+// based on the original build_llama() function, changes:
+// * non-parametric layer norm
+// * clamp qkv
+// * removed bias
+// * removed MoE
+struct ggml_cgraph * build_olmo() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+// mutable variable, needed during the last layer of the computation to skip unused tokens
+int32_t n_tokens = this->n_tokens;
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = build_inp_pos();
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+cur = llm_build_norm(ctx0, inpL, hparams,
+NULL, NULL,
+LLM_NORM, cb, il);
+cb(cur, "attn_norm", il);
+
+// self-attention
+{
+// compute Q and K and RoPE them
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+if (hparams.f_clamp_kqv > 0.0f) {
+Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+cb(Qcur, "Qcur", il);
+}
+
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+if (hparams.f_clamp_kqv > 0.0f) {
+Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+cb(Kcur, "Kcur", il);
+}
+
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+if (hparams.f_clamp_kqv > 0.0f) {
+Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+cb(Vcur, "Vcur", il);
+}
+
+Qcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+Kcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+model.layers[il].wo, nullptr,
+Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+n_tokens = n_outputs;
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+// feed-forward network
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+NULL, NULL,
+LLM_NORM, cb, il);
+cb(cur, "ffn_norm", il);
+
+cur = llm_build_ffn(ctx0, cur,
+model.layers[il].ffn_up, NULL,
+model.layers[il].ffn_gate, NULL,
+model.layers[il].ffn_down, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(cur, "ffn_out", il);
+
+cur = ggml_add(ctx0, cur, ffn_inp);
+cb(cur, "ffn_out", il);
+
+ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+if (layer_dir != nullptr) {
+cur = ggml_add(ctx0, cur, layer_dir);
+}
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+NULL, NULL,
+LLM_NORM, cb, -1);
+cb(cur, "result_norm", -1);
+
+// lm_head
+cur = ggml_mul_mat(ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -9737,10 +10660,18 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_qwen2();
 } break;
+case LLM_ARCH_QWEN2MOE:
+{
+result = llm.build_qwen2moe();
+} break;
 case LLM_ARCH_PHI2:
 {
 result = llm.build_phi2();
 } break;
+case LLM_ARCH_PHI3:
+{
+result = llm.build_phi3();
+} break;
 case LLM_ARCH_PLAMO:
 {
 result = llm.build_plamo();
@@ -9785,6 +10716,14 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_command_r();
 } break;
+case LLM_ARCH_DBRX:
+{
+result = llm.build_dbrx();
+} break;
+case LLM_ARCH_OLMO:
+{
+result = llm.build_olmo();
+} break;
 default:
 GGML_ASSERT(false);
 }
@@ -12556,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 GGML_ASSERT(ctx);
 const int64_t t_start_sample_us = ggml_time_us();
 
-bool
+bool allow_eog = false;
 for (const auto & stack : grammar->stacks) {
 if (stack.empty()) {
-
+allow_eog = true;
 break;
 }
 }
 
-const llama_token eos = llama_token_eos(&ctx->model);
-
 std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
 candidates_decoded.reserve(candidates->size);
 std::vector<llama_grammar_candidate> candidates_grammar;
@@ -12573,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
 for (size_t i = 0; i < candidates->size; ++i) {
 const llama_token id = candidates->data[i].id;
-const std::string piece = llama_token_to_piece(ctx, id);
-
-
+const std::string piece = llama_token_to_piece(ctx, id, false);
+
+if (llama_token_is_eog(&ctx->model, id)) {
+if (!allow_eog) {
 candidates->data[i].logit = -INFINITY;
 }
 } else if (piece.empty() || piece[0] == 0) {
@@ -12738,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
 return result;
 }
 
-llama_token
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
 GGML_ASSERT(ctx);
 
 const int64_t t_start_sample_us = ggml_time_us();
@@ -12751,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 }
 
 std::discrete_distribution<> dist(probs.begin(), probs.end());
-auto & rng = ctx->rng;
 int idx = dist(rng);
 
 llama_token result = candidates->data[idx].id;
@@ -12761,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 return result;
 }
 
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
 const int64_t t_start_sample_us = ggml_time_us();
 
-if (
+if (llama_token_is_eog(&ctx->model, token)) {
 for (const auto & stack : grammar->stacks) {
 if (stack.empty()) {
 return;
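Note: the hunk above splits sampling into an RNG-parameterized core plus a thin public wrapper that forwards ctx->rng, so existing callers keep their behaviour. A minimal sketch of what the new entry point enables, assuming llama_sample_token_with_rng is declared where the caller can see it (it is defined in llama.cpp for internal/test use) and that candidates already holds the softmaxed distribution:

    #include <random>
    #include "llama.h"

    // Hypothetical helper: reproducible sampling with a caller-owned generator
    // instead of the context RNG used by llama_sample_token().
    static llama_token sample_with_seed(llama_context * ctx, llama_token_data_array * candidates, uint32_t seed) {
        std::mt19937 rng(seed); // deterministic across runs for the same seed
        return llama_sample_token_with_rng(ctx, candidates, rng);
    }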
@@ -12773,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 GGML_ASSERT(false);
 }
 
-const std::string piece = llama_token_to_piece(ctx, token);
+const std::string piece = llama_token_to_piece(ctx, token, false);
 
 // Note terminating 0 in decoded string
 const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -12915,6 +13856,11 @@ struct llama_beam_search_data {
 }
 llama_logit_info logit_info(ctx);
 std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+
+// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
+// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
+llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
 size_t i=0;
 if (next_beams.size() < n_beams) {
 for (; next_beams.size() < n_beams ; ++i) {
@@ -13535,6 +14481,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 gguf_set_kv (ctx_out, ml.meta);
 gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+// Remove split metadata
+gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
+gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
 
 if (params->kv_overrides) {
 const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -13587,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 std::vector<no_init<uint8_t>> work;
 std::vector<no_init<float>> f32_conv_buf;
 
+uint16_t n_split = 1;
+// Assume split index is continuous
+if (params->keep_split) {
+for (int i = 0; i < ml.n_tensors; ++i) {
+n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+}
+}
+std::vector<gguf_context*> ctx_outs(n_split, NULL);
+ctx_outs[0] = ctx_out;
+
 // populate the original tensors so we get an initial meta data
 for (int i = 0; i < ml.n_tensors; ++i) {
-
-
+auto weight = ml.get_weight(i);
+uint16_t i_split = params->keep_split ? weight->idx : 0;
+struct ggml_tensor * tensor = weight->tensor;
+if (ctx_outs[i_split] == NULL) {
+ctx_outs[i_split] = gguf_init_empty();
+}
+gguf_add_tensor(ctx_outs[i_split], tensor);
 }
 
-
-
-
-
+// Set split info if needed
+if (n_split > 1) {
+for (size_t i = 0; i < ctx_outs.size(); ++i) {
+gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+}
+}
 
-
+int cur_split = -1;
+std::ofstream fout;
+auto close_ofstream = [&]() {
+// Write metadata and close file handler
+if (fout.is_open()) {
+fout.seekp(0);
+std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+gguf_get_meta_data(ctx_outs[cur_split], data.data());
+fout.write((const char *) data.data(), data.size());
+fout.close();
+}
+};
+auto new_ofstream = [&](int index) {
+cur_split = index;
+GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+std::string fname = fname_out;
+if (params->keep_split) {
+char split_path[PATH_MAX] = {0};
+llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+fname = std::string(split_path);
+}
 
-
-
+fout = std::ofstream(fname, std::ios::binary);
+fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+// placeholder for the meta data
+::zeros(fout, meta_size);
+};
 
 const auto tn = LLM_TN(model.arch);
-
+new_ofstream(0);
 for (int i = 0; i < ml.n_tensors; ++i) {
-
+auto weight = ml.get_weight(i);
+struct ggml_tensor * tensor = weight->tensor;
+if (weight->idx != cur_split && params->keep_split) {
+close_ofstream();
+new_ofstream(weight->idx);
+}
 
 const std::string name = ggml_get_name(tensor);
 
@@ -13761,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 total_size_new += new_size;
 
 // update the gguf meta data as we go
-gguf_set_tensor_type(
-gguf_set_tensor_data(
+gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
 
 // write tensor data + padding
 fout.write((const char *) new_data, new_size);
 zeros(fout, GGML_PAD(new_size, align) - new_size);
 }
-
-
-
-fout.seekp(0);
-std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-gguf_get_meta_data(ctx_out, data.data());
-fout.write((const char *) data.data(), data.size());
+close_ofstream();
+for (auto & c:ctx_outs) {
+gguf_free(c);
 }
 
-fout.close();
-
-gguf_free(ctx_out);
-
 LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
 LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
@@ -14136,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 /*.quantize_output_tensor =*/ true,
 /*.only_copy =*/ false,
 /*.pure =*/ false,
+/*.keep_split =*/ false,
 /*.imatrix =*/ nullptr,
 /*.kv_overrides =*/ nullptr,
 };
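Note: the new keep_split default rounds out the split-aware quantization path above. A hedged sketch of driving it through the public API; the shard file names below are placeholders:

    #include "llama.h"

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
        params.keep_split = true;                      // write one output shard per input shard

        // With keep_split set, shard names are derived from the output base path
        // via llama_split_path(), mirroring the input's NN-of-MM layout.
        return (int) llama_model_quantize("model-00001-of-00003.gguf", "model-q4_k_m.gguf", &params);
    }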
@@ -14629,18 +15620,22 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_MINICPM:
 case LLM_ARCH_XVERSE:
 case LLM_ARCH_COMMAND_R:
+case LLM_ARCH_OLMO:
 return LLAMA_ROPE_TYPE_NORM;
 
 // the pairs of head values are offset by n_rot/2
 case LLM_ARCH_FALCON:
 case LLM_ARCH_GROK:
+case LLM_ARCH_DBRX:
 case LLM_ARCH_PERSIMMON:
 case LLM_ARCH_BERT:
 case LLM_ARCH_NOMIC_BERT:
 case LLM_ARCH_STABLELM:
 case LLM_ARCH_QWEN:
 case LLM_ARCH_QWEN2:
+case LLM_ARCH_QWEN2MOE:
 case LLM_ARCH_PHI2:
+case LLM_ARCH_PHI3:
 case LLM_ARCH_GEMMA:
 case LLM_ARCH_STARCODER2:
 return LLAMA_ROPE_TYPE_NEOX;
@@ -14654,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 return LLAMA_ROPE_TYPE_NONE;
 }
 
+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+return ctx->cparams.pooling_type;
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
 return model->hparams.n_vocab;
 }
@@ -15132,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
 *
 */
 static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+llama_synchronize(ctx);
+
 // copy rng
 {
 std::ostringstream rng_ss;
@@ -15284,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
 
 // Sets the state reading from the specified source address
 size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
+llama_synchronize(ctx);
+
 const uint8_t * inp = src;
 
 // set rng
@@ -15320,6 +16323,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
 GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
 ctx->output_ids[id] = i;
 }
+
+ctx->n_outputs = n_outputs;
 }
 }
 
@@ -15586,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
 }
 
 static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+llama_synchronize(ctx);
+
 const auto & kv_self = ctx->kv_self;
 GGML_ASSERT(!kv_self.recurrent); // not implemented
 
@@ -15703,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
 }
 
 size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+llama_synchronize(ctx);
+
 auto & kv_self = ctx->kv_self;
 GGML_ASSERT(!kv_self.recurrent); // not implemented
 
@@ -16154,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
 return model->vocab.id_to_token[token].type;
 }
 
+bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+return token != -1 && (
+token == llama_token_eos(model) ||
+token == llama_token_eot(model)
+);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
 return model->vocab.special_bos_id;
 }
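Note: llama_token_is_eog() folds the end-of-sequence and end-of-turn checks that callers previously wrote by hand against llama_token_eos(). A minimal sketch of the intended call-site change, with model and id assumed to come from an existing decode loop:

    #include "llama.h"

    // Before this release: stop only when id == llama_token_eos(model).
    // Now: stop on any end-of-generation token (EOS or EOT, e.g. for chat/infill models).
    static bool generation_finished(const struct llama_model * model, llama_token id) {
        return llama_token_is_eog(model, id);
    }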
@@ -16231,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
 if (0 <= token && token < llama_n_vocab(model)) {
 switch (llama_vocab_get_type(model->vocab)) {
 case LLAMA_VOCAB_TYPE_WPM:
@@ -16246,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
 }
 memcpy(buf, result.c_str(), result.length());
 return result.length();
-} else if (
+} else if (
+(llama_is_user_defined_token(model->vocab, token)) ||
+(llama_is_control_token (model->vocab, token) && special)) {
 std::string result = model->vocab.id_to_token[token].text;
 if (length < (int) result.length()) {
 return -(int) result.length();
@@ -16259,8 +17277,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
 }
 memcpy(buf, "\xe2\x96\x85", 3);
 return 3;
-} else if (llama_is_control_token(model->vocab, token)) {
-;
 } else if (llama_is_byte_token(model->vocab, token)) {
 if (length < 1) {
 return -1;
@@ -16281,15 +17297,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
 }
 memcpy(buf, result.c_str(), result.length());
 return result.length();
-} else if (
+} else if (
+(llama_is_user_defined_token(model->vocab, token)) ||
+(llama_is_control_token (model->vocab, token) && special)) {
 std::string result = model->vocab.id_to_token[token].text;
 if (length < (int) result.length()) {
 return -(int) result.length();
 }
 memcpy(buf, result.c_str(), result.length());
 return result.length();
-} else if (llama_is_control_token(model->vocab, token)) {
-;
 }
 break;
 }
@@ -16472,6 +17488,39 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "### Response:\n";
 }
+} else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+// CohereForAI/c4ai-command-r-plus
+for (auto message : chat) {
+std::string role(message->role);
+if (role == "system") {
+ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+} else if (role == "user") {
+ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+} else if (role == "assistant") {
+ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+}
+}
+if (add_ass) {
+ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
+}
+} else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+// Llama 3
+for (auto message : chat) {
+std::string role(message->role);
+ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+}
+if (add_ass) {
+ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+}
+} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+// Phi 3
+for (auto message : chat) {
+std::string role(message->role);
+ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+}
+if (add_ass) {
+ss << "<|assistant|>\n";
+}
 } else {
 // template not supported
 return -1;
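Note: with the Command R, Llama 3 and Phi 3 branches added above, those formats can be requested by template name through the public llama_chat_apply_template() API. A hedged sketch (passing a template name instead of the model's own metadata, and the initial buffer size, are assumptions of this example):

    #include <string>
    #include <vector>
    #include "llama.h"

    std::string format_llama3(const std::vector<llama_chat_message> & msgs) {
        std::vector<char> buf(8192);
        int32_t n = llama_chat_apply_template(
            nullptr, "llama3",                 // model is only consulted when tmpl is NULL
            msgs.data(), msgs.size(),
            /*add_ass=*/true,                  // append the assistant header for generation
            buf.data(), (int32_t) buf.size());
        if (n < 0) {
            return {};                         // template not recognized
        }
        if (n > (int32_t) buf.size()) {        // retry with a buffer large enough for the result
            buf.resize(n);
            n = llama_chat_apply_template(nullptr, "llama3", msgs.data(), msgs.size(), true, buf.data(), n);
            if (n < 0) {
                return {};
            }
        }
        return std::string(buf.data(), (size_t) n);
    }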
@@ -16604,6 +17653,11 @@ const char * llama_print_system_info(void) {
 s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
 s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
 s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+s += "LAMMAFILE = 1 | ";
+#else
+s += "LAMMAFILE = 0 | ";
+#endif
 
 return s.c_str();
 }