cui-llama.rn 1.3.3 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +5 -7
- package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
- package/android/src/main/jni.cpp +9 -9
- package/cpp/common.cpp +28 -44
- package/cpp/common.h +35 -14
- package/cpp/ggml-alloc.c +0 -1
- package/cpp/ggml-backend-impl.h +38 -20
- package/cpp/ggml-backend-reg.cpp +246 -92
- package/cpp/ggml-backend.h +1 -0
- package/cpp/ggml-common.h +42 -48
- package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +642 -223
- package/cpp/ggml-cpu-aarch64.h +2 -26
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +14122 -13971
- package/cpp/ggml-cpu.cpp +627 -715
- package/cpp/ggml-cpu.h +0 -17
- package/cpp/ggml-impl.h +22 -6
- package/cpp/ggml-metal.m +482 -24
- package/cpp/ggml-quants.c +0 -9
- package/cpp/ggml-threading.h +4 -2
- package/cpp/ggml.c +284 -178
- package/cpp/ggml.h +73 -25
- package/cpp/llama-grammar.cpp +15 -15
- package/cpp/llama-grammar.h +2 -5
- package/cpp/llama-sampling.cpp +35 -90
- package/cpp/llama-vocab.cpp +7 -2
- package/cpp/llama-vocab.h +1 -1
- package/cpp/llama.cpp +1782 -586
- package/cpp/llama.h +20 -19
- package/cpp/sampling.cpp +11 -16
- package/cpp/sgemm.cpp +265 -258
- package/cpp/sgemm.h +2 -2
- package/cpp/speculative.cpp +4 -0
- package/cpp/unicode.cpp +51 -51
- package/cpp/unicode.h +9 -10
- package/lib/commonjs/index.js +38 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +36 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +2 -3
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +36 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +3 -3
- package/src/index.ts +46 -2
- package/cpp/amx/amx.cpp +0 -196
- package/cpp/amx/amx.h +0 -20
- package/cpp/amx/common.h +0 -101
- package/cpp/amx/mmq.cpp +0 -2524
- package/cpp/amx/mmq.h +0 -16
- package/cpp/ggml-aarch64.c +0 -129
- package/cpp/ggml-aarch64.h +0 -19
package/cpp/llama.cpp
CHANGED
@@ -157,6 +157,7 @@ static std::string format(const char * fmt, ...) {
 
 enum llm_arch {
     LLM_ARCH_LLAMA,
+    LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
     LLM_ARCH_GROK,
@@ -174,6 +175,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
+    LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
@@ -194,6 +196,7 @@ enum llm_arch {
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
@@ -206,61 +209,66 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_DECI, "deci" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GROK, "grok" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+    { LLM_ARCH_QWEN2VL, "qwen2vl" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_MINICPM3, "minicpm3" },
+    { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_GEMMA2, "gemma2" },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_XVERSE, "xverse" },
+    { LLM_ARCH_COMMAND_R, "command-r" },
+    { LLM_ARCH_DBRX, "dbrx" },
+    { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_OLMO2, "olmo2" },
+    { LLM_ARCH_OLMOE, "olmoe" },
+    { LLM_ARCH_OPENELM, "openelm" },
+    { LLM_ARCH_ARCTIC, "arctic" },
+    { LLM_ARCH_DEEPSEEK, "deepseek" },
+    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+    { LLM_ARCH_CHATGLM, "chatglm" },
+    { LLM_ARCH_BITNET, "bitnet" },
+    { LLM_ARCH_T5, "t5" },
+    { LLM_ARCH_T5ENCODER, "t5encoder" },
+    { LLM_ARCH_JAIS, "jais" },
+    { LLM_ARCH_NEMOTRON, "nemotron" },
+    { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+    { LLM_ARCH_CHAMELEON, "chameleon" },
+    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 enum llm_kv {
@@ -280,6 +288,7 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
@@ -311,6 +320,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
@@ -319,6 +330,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
@@ -373,6 +385,12 @@ enum llm_kv {
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
 
+    LLM_KV_POSNET_EMBEDDING_LENGTH,
+    LLM_KV_POSNET_BLOCK_COUNT,
+
+    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+    LLM_KV_CONVNEXT_BLOCK_COUNT,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -396,6 +414,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+    { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
     { LLM_KV_BLOCK_COUNT, "%s.block_count" },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
     { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
@@ -427,6 +446,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
     { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
     { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
@@ -435,6 +456,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -456,6 +478,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
 
+    { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
+    { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" },
+
+    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
+    { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -614,6 +642,22 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
+    LLM_TENSOR_CONV1D,
+    LLM_TENSOR_CONVNEXT_DW,
+    LLM_TENSOR_CONVNEXT_NORM,
+    LLM_TENSOR_CONVNEXT_PW1,
+    LLM_TENSOR_CONVNEXT_PW2,
+    LLM_TENSOR_CONVNEXT_GAMMA,
+    LLM_TENSOR_POS_NET_CONV1,
+    LLM_TENSOR_POS_NET_CONV2,
+    LLM_TENSOR_POS_NET_NORM,
+    LLM_TENSOR_POS_NET_NORM1,
+    LLM_TENSOR_POS_NET_NORM2,
+    LLM_TENSOR_POS_NET_ATTN_NORM,
+    LLM_TENSOR_POS_NET_ATTN_Q,
+    LLM_TENSOR_POS_NET_ATTN_K,
+    LLM_TENSOR_POS_NET_ATTN_V,
+    LLM_TENSOR_POS_NET_ATTN_OUT,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -643,6 +687,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DECI,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_BAICHUAN,
         {
@@ -909,6 +979,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_QWEN2MOE,
         {
@@ -1047,6 +1134,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
             { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1297,6 +1386,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DEEPSEEK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DEEPSEEK2,
         {
@@ -1552,6 +1668,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_WAVTOKENIZER_DEC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_CONV1D, "conv1d" },
+            { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
+            { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
+            { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
+            { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
+            { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
+            { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
+            { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1560,6 +1701,73 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
     },
 };
 
+enum llm_chat_template {
+    LLM_CHAT_TEMPLATE_CHATML,
+    LLM_CHAT_TEMPLATE_LLAMA_2,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+    LLM_CHAT_TEMPLATE_MISTRAL_V1,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_FALCON_3,
+    LLM_CHAT_TEMPLATE_ZEPHYR,
+    LLM_CHAT_TEMPLATE_MONARCH,
+    LLM_CHAT_TEMPLATE_GEMMA,
+    LLM_CHAT_TEMPLATE_ORION,
+    LLM_CHAT_TEMPLATE_OPENCHAT,
+    LLM_CHAT_TEMPLATE_VICUNA,
+    LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+    LLM_CHAT_TEMPLATE_DEEPSEEK,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+    LLM_CHAT_TEMPLATE_COMMAND_R,
+    LLM_CHAT_TEMPLATE_LLAMA_3,
+    LLM_CHAT_TEMPLATE_CHATGML_3,
+    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_MINICPM,
+    LLM_CHAT_TEMPLATE_EXAONE_3,
+    LLM_CHAT_TEMPLATE_RWKV_WORLD,
+    LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_GIGACHAT,
+    LLM_CHAT_TEMPLATE_MEGREZ,
+    LLM_CHAT_TEMPLATE_UNKNOWN,
+};
+
+static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+    { "chatml", LLM_CHAT_TEMPLATE_CHATML },
+    { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
+    { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
+    { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
+    { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+    { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
+    { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
+    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+    { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
+    { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
+    { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
+    { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
+    { "orion", LLM_CHAT_TEMPLATE_ORION },
+    { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
+    { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
+    { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
+    { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
+    { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
+    { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
+    { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
+    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
+    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
+    { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
+    { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
+    { "granite", LLM_CHAT_TEMPLATE_GRANITE },
+    { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+    { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
+};
+
 static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
@@ -1633,9 +1841,10 @@ struct LLM_TN {
 //
 
 static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_TYPE_NONE,
-    { LLAMA_ROPE_SCALING_TYPE_LINEAR,
-    { LLAMA_ROPE_SCALING_TYPE_YARN,
+    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -1741,7 +1950,7 @@ private:
         DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                       NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
         if (!bufLen) {
-            ret = format("Win32 error code: %
+            ret = format("Win32 error code: %lx", error_code);
         } else {
             ret = lpMsgBuf;
             LocalFree(lpMsgBuf);
@@ -2079,7 +2288,7 @@ struct llama_mmap {
         HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
 
         // may fail on pre-Windows 8 systems
-        pPrefetchVirtualMemory =
+        pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
 
         if (pPrefetchVirtualMemory) {
             // advise the kernel to preload the mapped memory
@@ -2378,15 +2587,26 @@ static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 
+struct llama_hparams_posnet {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams_convnext {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;
 
-    uint32_t n_vocab;
+    uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
+    uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_swa = 0; // sliding window attention (SWA)
@@ -2397,6 +2617,10 @@ struct llama_hparams {
     uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
 
+    // for WavTokenizer
+    struct llama_hparams_posnet posnet;
+    struct llama_hparams_convnext convnext;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -2411,6 +2635,9 @@ struct llama_hparams {
 
     float f_norm_eps;
     float f_norm_rms_eps;
+    float f_norm_group_eps;
+
+    uint32_t n_norm_groups;
 
     float f_attn_logit_softcapping = 50.0f;
     float f_final_logit_softcapping = 30.0f;
@@ -2421,11 +2648,12 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
 
-    float
-    float
-    float
-    uint32_t
-    float
+    float rope_attn_factor = 1.0f;
+    float rope_freq_base_train;
+    float rope_freq_scale_train;
+    uint32_t n_ctx_orig_yarn;
+    float rope_yarn_log_mul;
+    int rope_sections[4];
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -2455,63 +2683,6 @@ struct llama_hparams {
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
-    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
-        if (this->n_vocab != other.n_vocab) return true;
-        if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
-        if (this->n_layer != other.n_layer) return true;
-        if (this->n_rot != other.n_rot) return true;
-        if (this->n_swa != other.n_swa) return true;
-        if (this->n_embd_head_k != other.n_embd_head_k) return true;
-        if (this->n_embd_head_v != other.n_embd_head_v) return true;
-        if (this->n_expert != other.n_expert) return true;
-        if (this->n_expert_used != other.n_expert_used) return true;
-
-        if (this->n_head_arr != other.n_head_arr) return true;
-        if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-        if (this->n_ff_arr != other.n_ff_arr) return true;
-
-        if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
-        if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-        if (this->n_lora_q != other.n_lora_q) return true;
-        if (this->n_lora_kv != other.n_lora_kv) return true;
-        if (this->n_ff_exp != other.n_ff_exp) return true;
-        if (this->n_ff_shexp != other.n_ff_shexp) return true;
-        if (this->n_expert_shared != other.n_expert_shared) return true;
-
-        if (this->rope_finetuned != other.rope_finetuned) return true;
-        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
-
-        if (this->ssm_d_conv != other.ssm_d_conv) return true;
-        if (this->ssm_d_inner != other.ssm_d_inner) return true;
-        if (this->ssm_d_state != other.ssm_d_state) return true;
-        if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
-        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
-
-        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
-        if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
-        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
-        if (this->wkv_head_size != other.wkv_head_size) return true;
-
-        if (this->dec_start_token_id != other.dec_start_token_id) return true;
-
-        const float EPSILON = 1e-9f;
-
-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
-        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
-
-        return false;
-    }
-
     uint32_t n_head(uint32_t il = 0) const {
         if (il < n_layer) {
             return n_head_arr[il];
@@ -2564,21 +2735,21 @@ struct llama_hparams {
         if (wkv_head_size != 0) {
             // for RWKV models
            return 2 * n_embd;
-        } else {
-            // TODO: maybe support other convolution strides than 1
-            // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-            return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
         }
+
+        // TODO: maybe support other convolution strides than 1
+        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
     }
 
     uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
         if (wkv_head_size != 0) {
             // corresponds to RWKV's wkv_states size
             return n_embd * wkv_head_size;
-        } else {
-            // corresponds to Mamba's ssm_states size
-            return ssm_d_state * ssm_d_inner;
         }
+
+        // corresponds to Mamba's ssm_states size
+        return ssm_d_state * ssm_d_inner;
     }
 };
 
@@ -2616,142 +2787,187 @@ struct llama_cparams {
     void * cb_eval_user_data;
 };
 
-
-
-
-
-
-
+struct llama_layer_posnet {
+    // resnet
+    struct lm_ggml_tensor * norm1 = nullptr;
+    struct lm_ggml_tensor * norm1_b = nullptr;
+
+    struct lm_ggml_tensor * conv1 = nullptr;
+    struct lm_ggml_tensor * conv1_b = nullptr;
+
+    struct lm_ggml_tensor * norm2 = nullptr;
+    struct lm_ggml_tensor * norm2_b = nullptr;
+
+    struct lm_ggml_tensor * conv2 = nullptr;
+    struct lm_ggml_tensor * conv2_b = nullptr;
+
+    // attention
+    struct lm_ggml_tensor * attn_norm = nullptr;
+    struct lm_ggml_tensor * attn_norm_b = nullptr;
+
+    struct lm_ggml_tensor * attn_q = nullptr;
+    struct lm_ggml_tensor * attn_q_b = nullptr;
+
+    struct lm_ggml_tensor * attn_k = nullptr;
+    struct lm_ggml_tensor * attn_k_b = nullptr;
+
+    struct lm_ggml_tensor * attn_v = nullptr;
+    struct lm_ggml_tensor * attn_v_b = nullptr;
+
+    struct lm_ggml_tensor * attn_o = nullptr;
+    struct lm_ggml_tensor * attn_o_b = nullptr;
+
+    // normalize
+    struct lm_ggml_tensor * norm = nullptr;
+    struct lm_ggml_tensor * norm_b = nullptr;
+};
+
+struct llama_layer_convnext {
+    struct lm_ggml_tensor * dw = nullptr;
+    struct lm_ggml_tensor * dw_b = nullptr;
 
+    struct lm_ggml_tensor * norm = nullptr;
+    struct lm_ggml_tensor * norm_b = nullptr;
+
+    struct lm_ggml_tensor * pw1 = nullptr;
+    struct lm_ggml_tensor * pw1_b = nullptr;
+
+    struct lm_ggml_tensor * pw2 = nullptr;
+    struct lm_ggml_tensor * pw2_b = nullptr;
+
+    struct lm_ggml_tensor * gamma = nullptr;
+};
+
+struct llama_layer {
     // normalization
-    struct lm_ggml_tensor * attn_norm;
-    struct lm_ggml_tensor * attn_norm_b;
-    struct lm_ggml_tensor * attn_norm_2;
-    struct lm_ggml_tensor * attn_norm_2_b;
-    struct lm_ggml_tensor * attn_q_norm;
-    struct lm_ggml_tensor * attn_q_norm_b;
-    struct lm_ggml_tensor * attn_k_norm;
-    struct lm_ggml_tensor * attn_k_norm_b;
-    struct lm_ggml_tensor * attn_out_norm;
-    struct lm_ggml_tensor * attn_out_norm_b;
-    struct lm_ggml_tensor * attn_q_a_norm;
-    struct lm_ggml_tensor * attn_kv_a_norm;
-    struct lm_ggml_tensor * attn_sub_norm;
-    struct lm_ggml_tensor * attn_post_norm;
-    struct lm_ggml_tensor * ffn_sub_norm;
-    struct lm_ggml_tensor * attn_norm_cross;
-    struct lm_ggml_tensor * attn_norm_enc;
+    struct lm_ggml_tensor * attn_norm = nullptr;
+    struct lm_ggml_tensor * attn_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_norm_2 = nullptr;
+    struct lm_ggml_tensor * attn_norm_2_b = nullptr;
+    struct lm_ggml_tensor * attn_q_norm = nullptr;
+    struct lm_ggml_tensor * attn_q_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_k_norm = nullptr;
+    struct lm_ggml_tensor * attn_k_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_out_norm = nullptr;
+    struct lm_ggml_tensor * attn_out_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_q_a_norm = nullptr;
+    struct lm_ggml_tensor * attn_kv_a_norm = nullptr;
+    struct lm_ggml_tensor * attn_sub_norm = nullptr;
+    struct lm_ggml_tensor * attn_post_norm = nullptr;
+    struct lm_ggml_tensor * ffn_sub_norm = nullptr;
+    struct lm_ggml_tensor * attn_norm_cross = nullptr;
+    struct lm_ggml_tensor * attn_norm_enc = nullptr;
 
     // attention
-    struct lm_ggml_tensor * wq;
-    struct lm_ggml_tensor * wk;
-    struct lm_ggml_tensor * wv;
-    struct lm_ggml_tensor * wo;
-    struct lm_ggml_tensor * wqkv;
-    struct lm_ggml_tensor * wq_a;
-    struct lm_ggml_tensor * wq_b;
-    struct lm_ggml_tensor * wkv_a_mqa;
-    struct lm_ggml_tensor * wkv_b;
-    struct lm_ggml_tensor * wq_cross;
-    struct lm_ggml_tensor * wk_cross;
-    struct lm_ggml_tensor * wv_cross;
-    struct lm_ggml_tensor * wo_cross;
-    struct lm_ggml_tensor * wq_enc;
-    struct lm_ggml_tensor * wk_enc;
-    struct lm_ggml_tensor * wv_enc;
-    struct lm_ggml_tensor * wo_enc;
+    struct lm_ggml_tensor * wq = nullptr;
+    struct lm_ggml_tensor * wk = nullptr;
+    struct lm_ggml_tensor * wv = nullptr;
+    struct lm_ggml_tensor * wo = nullptr;
+    struct lm_ggml_tensor * wqkv = nullptr;
+    struct lm_ggml_tensor * wq_a = nullptr;
+    struct lm_ggml_tensor * wq_b = nullptr;
+    struct lm_ggml_tensor * wkv_a_mqa = nullptr;
+    struct lm_ggml_tensor * wkv_b = nullptr;
+    struct lm_ggml_tensor * wq_cross = nullptr;
+    struct lm_ggml_tensor * wk_cross = nullptr;
+    struct lm_ggml_tensor * wv_cross = nullptr;
+    struct lm_ggml_tensor * wo_cross = nullptr;
+    struct lm_ggml_tensor * wq_enc = nullptr;
+    struct lm_ggml_tensor * wk_enc = nullptr;
+    struct lm_ggml_tensor * wv_enc = nullptr;
+    struct lm_ggml_tensor * wo_enc = nullptr;
 
     // attention bias
-    struct lm_ggml_tensor * bq;
-    struct lm_ggml_tensor * bk;
-    struct lm_ggml_tensor * bv;
-    struct lm_ggml_tensor * bo;
-    struct lm_ggml_tensor * bqkv;
+    struct lm_ggml_tensor * bq = nullptr;
+    struct lm_ggml_tensor * bk = nullptr;
+    struct lm_ggml_tensor * bv = nullptr;
+    struct lm_ggml_tensor * bo = nullptr;
+    struct lm_ggml_tensor * bqkv = nullptr;
 
     // relative position bias
-    struct lm_ggml_tensor * attn_rel_b;
-    struct lm_ggml_tensor * attn_rel_b_enc;
-    struct lm_ggml_tensor * attn_rel_b_cross;
+    struct lm_ggml_tensor * attn_rel_b = nullptr;
+    struct lm_ggml_tensor * attn_rel_b_enc = nullptr;
+    struct lm_ggml_tensor * attn_rel_b_cross = nullptr;
 
     // normalization
-    struct lm_ggml_tensor * ffn_norm;
-    struct lm_ggml_tensor * ffn_norm_b;
-    struct lm_ggml_tensor * ffn_post_norm;
-    struct lm_ggml_tensor * layer_out_norm;
-    struct lm_ggml_tensor * layer_out_norm_b;
-    struct lm_ggml_tensor * ffn_norm_exps;
-    struct lm_ggml_tensor * ffn_norm_enc;
+    struct lm_ggml_tensor * ffn_norm = nullptr;
+    struct lm_ggml_tensor * ffn_norm_b = nullptr;
+    struct lm_ggml_tensor * ffn_post_norm = nullptr;
+    struct lm_ggml_tensor * layer_out_norm = nullptr;
+    struct lm_ggml_tensor * layer_out_norm_b = nullptr;
+    struct lm_ggml_tensor * ffn_norm_exps = nullptr;
+    struct lm_ggml_tensor * ffn_norm_enc = nullptr;
 
     // ff
-    struct lm_ggml_tensor * ffn_gate; // w1
-    struct lm_ggml_tensor * ffn_down; // w2
-    struct lm_ggml_tensor * ffn_up;
-    struct lm_ggml_tensor * ffn_gate_enc;
-    struct lm_ggml_tensor * ffn_down_enc;
-    struct lm_ggml_tensor * ffn_up_enc;
+    struct lm_ggml_tensor * ffn_gate = nullptr; // w1
+    struct lm_ggml_tensor * ffn_down = nullptr; // w2
+    struct lm_ggml_tensor * ffn_up = nullptr; // w3
+    struct lm_ggml_tensor * ffn_gate_enc = nullptr;
+    struct lm_ggml_tensor * ffn_down_enc = nullptr;
+    struct lm_ggml_tensor * ffn_up_enc = nullptr;
 
     // ff MoE
-    struct lm_ggml_tensor * ffn_gate_inp;
-    struct lm_ggml_tensor * ffn_gate_exps;
-    struct lm_ggml_tensor * ffn_down_exps;
-    struct lm_ggml_tensor * ffn_up_exps ;
+    struct lm_ggml_tensor * ffn_gate_inp = nullptr;
+    struct lm_ggml_tensor * ffn_gate_exps = nullptr;
+    struct lm_ggml_tensor * ffn_down_exps = nullptr;
+    struct lm_ggml_tensor * ffn_up_exps = nullptr;
 
     // ff shared expert (shexp)
-    struct lm_ggml_tensor * ffn_gate_inp_shexp;
-    struct lm_ggml_tensor * ffn_gate_shexp;
-    struct lm_ggml_tensor * ffn_down_shexp;
-    struct lm_ggml_tensor * ffn_up_shexp;
+    struct lm_ggml_tensor * ffn_gate_inp_shexp = nullptr;
+    struct lm_ggml_tensor * ffn_gate_shexp = nullptr;
+    struct lm_ggml_tensor * ffn_down_shexp = nullptr;
+    struct lm_ggml_tensor * ffn_up_shexp = nullptr;
 
     // ff bias
-    struct lm_ggml_tensor * ffn_gate_b;
-    struct lm_ggml_tensor * ffn_down_b; // b2
-    struct lm_ggml_tensor * ffn_up_b; // b3
-    struct lm_ggml_tensor * ffn_act;
+    struct lm_ggml_tensor * ffn_gate_b = nullptr;
+    struct lm_ggml_tensor * ffn_down_b = nullptr; // b2
+    struct lm_ggml_tensor * ffn_up_b = nullptr; // b3
+    struct lm_ggml_tensor * ffn_act = nullptr;
 
     // mamba proj
-    struct lm_ggml_tensor * ssm_in;
-    struct lm_ggml_tensor * ssm_x;
-    struct lm_ggml_tensor * ssm_dt;
-    struct lm_ggml_tensor * ssm_out;
+    struct lm_ggml_tensor * ssm_in = nullptr;
+    struct lm_ggml_tensor * ssm_x = nullptr;
+    struct lm_ggml_tensor * ssm_dt = nullptr;
+    struct lm_ggml_tensor * ssm_out = nullptr;
 
     // mamba
-    struct lm_ggml_tensor * ssm_conv1d;
-    struct lm_ggml_tensor * ssm_a;
-    struct lm_ggml_tensor * ssm_d;
+    struct lm_ggml_tensor * ssm_conv1d = nullptr;
+    struct lm_ggml_tensor * ssm_a = nullptr;
+    struct lm_ggml_tensor * ssm_d = nullptr;
 
     // mamba bias
-    struct lm_ggml_tensor * ssm_conv1d_b;
-    struct lm_ggml_tensor * ssm_dt_b;
+    struct lm_ggml_tensor * ssm_conv1d_b = nullptr;
+    struct lm_ggml_tensor * ssm_dt_b = nullptr;
 
     // rwkv
-    struct lm_ggml_tensor * time_mix_w1;
-    struct lm_ggml_tensor * time_mix_w2;
-    struct lm_ggml_tensor * time_mix_lerp_x;
-    struct lm_ggml_tensor * time_mix_lerp_w;
-    struct lm_ggml_tensor * time_mix_lerp_k;
-    struct lm_ggml_tensor * time_mix_lerp_v;
-    struct lm_ggml_tensor * time_mix_lerp_r;
-    struct lm_ggml_tensor * time_mix_lerp_g;
-
-    struct lm_ggml_tensor * time_mix_first;
-    struct lm_ggml_tensor * time_mix_decay;
-    struct lm_ggml_tensor * time_mix_decay_w1;
-    struct lm_ggml_tensor * time_mix_decay_w2;
-    struct lm_ggml_tensor * time_mix_key;
-    struct lm_ggml_tensor * time_mix_value;
-    struct lm_ggml_tensor * time_mix_receptance;
-    struct lm_ggml_tensor * time_mix_gate;
-
-    struct lm_ggml_tensor * time_mix_ln;
-    struct lm_ggml_tensor * time_mix_ln_b;
-    struct lm_ggml_tensor * time_mix_output;
-
-    struct lm_ggml_tensor * channel_mix_lerp_k;
-    struct lm_ggml_tensor * channel_mix_lerp_r;
-
-    struct lm_ggml_tensor * channel_mix_key;
-    struct lm_ggml_tensor * channel_mix_receptance;
-    struct lm_ggml_tensor * channel_mix_value;
+    struct lm_ggml_tensor * time_mix_w1 = nullptr;
+    struct lm_ggml_tensor * time_mix_w2 = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_x = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_w = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_k = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_v = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_r = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_g = nullptr;
+
+    struct lm_ggml_tensor * time_mix_first = nullptr;
+    struct lm_ggml_tensor * time_mix_decay = nullptr;
+    struct lm_ggml_tensor * time_mix_decay_w1 = nullptr;
+    struct lm_ggml_tensor * time_mix_decay_w2 = nullptr;
+    struct lm_ggml_tensor * time_mix_key = nullptr;
+    struct lm_ggml_tensor * time_mix_value = nullptr;
+    struct lm_ggml_tensor * time_mix_receptance = nullptr;
+    struct lm_ggml_tensor * time_mix_gate = nullptr;
+
+    struct lm_ggml_tensor * time_mix_ln = nullptr;
+    struct lm_ggml_tensor * time_mix_ln_b = nullptr;
+    struct lm_ggml_tensor * time_mix_output = nullptr;
+
+    struct lm_ggml_tensor * channel_mix_lerp_k = nullptr;
+    struct lm_ggml_tensor * channel_mix_lerp_r = nullptr;
+
+    struct lm_ggml_tensor * channel_mix_key = nullptr;
+    struct lm_ggml_tensor * channel_mix_receptance = nullptr;
+    struct lm_ggml_tensor * channel_mix_value = nullptr;
 
     // long rope factors
     struct lm_ggml_tensor * rope_long = nullptr;
|
|
2759
2975
|
struct lm_ggml_tensor * rope_freqs = nullptr;
|
2760
2976
|
|
2761
2977
|
// bitnet scale
|
2762
|
-
struct lm_ggml_tensor * wq_scale;
|
2763
|
-
struct lm_ggml_tensor * wk_scale;
|
2764
|
-
struct lm_ggml_tensor * wv_scale;
|
2765
|
-
struct lm_ggml_tensor * wo_scale;
|
2766
|
-
struct lm_ggml_tensor * ffn_gate_scale;
|
2767
|
-
struct lm_ggml_tensor * ffn_up_scale;
|
2768
|
-
struct lm_ggml_tensor * ffn_down_scale;
|
2978
|
+
struct lm_ggml_tensor * wq_scale = nullptr;
|
2979
|
+
struct lm_ggml_tensor * wk_scale = nullptr;
|
2980
|
+
struct lm_ggml_tensor * wv_scale = nullptr;
|
2981
|
+
struct lm_ggml_tensor * wo_scale = nullptr;
|
2982
|
+
struct lm_ggml_tensor * ffn_gate_scale = nullptr;
|
2983
|
+
struct lm_ggml_tensor * ffn_up_scale = nullptr;
|
2984
|
+
struct lm_ggml_tensor * ffn_down_scale = nullptr;
|
2985
|
+
|
2986
|
+
struct llama_layer_posnet posnet;
|
2987
|
+
|
2988
|
+
struct llama_layer_convnext convnext;
|
2769
2989
|
};
|
2770
2990
|
|
2771
2991
|
// very similar to llama_batch,
|
@@ -2896,6 +3116,9 @@ struct llama_model {
     struct lm_ggml_tensor * cls_out = nullptr;
     struct lm_ggml_tensor * cls_out_b = nullptr;
 
+    struct lm_ggml_tensor * conv1d = nullptr;
+    struct lm_ggml_tensor * conv1d_b = nullptr;
+
     std::vector<llama_layer> layers;
 
     // gguf metadata
@@ -2980,6 +3203,7 @@ struct llama_sbatch {
     // batch indices of the output
     std::vector<size_t> out_ids;
     std::vector<llama_sbatch_seq> seq;
+
     const llama_batch * batch = nullptr;
 
     // buffers for the ubatch
@@ -3325,6 +3549,11 @@ struct llama_context {
     // whether we are computing encoder output or decoder output
     bool is_encoding = false;
 
+    // TODO: find a better way to accommodate mutli-dimension position encoding methods
+    // number of position id each token get, 1 for each token in most cases.
+    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
+    int n_pos_per_token = 1;
+
     // output of the encoder part of the encoder-decoder models
     std::vector<float> embd_enc;
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -3395,6 +3624,17 @@ static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
+static struct lm_ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
+    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+            [name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
+                return it.first == name;
+            });
+    if (it == model->tensors_by_name.end()) {
+        return nullptr;
+    }
+    return it->second;
+}
+
 template<typename F>
 static bool buft_supported(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_dev_t dev, F & fn) {
     lm_ggml_init_params params = {
@@ -3448,7 +3688,9 @@ static bool llama_kv_cache_init(
 
     const struct llama_hparams & hparams = model.hparams;
 
-    const
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer);
 
     cache.has_shift = false;
 
@@ -3489,10 +3731,12 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    for (int i = 0; i <
+    for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
         lm_ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer.at(i).dev;
@@ -4525,9 +4769,6 @@ struct llama_model_loader {
             case LM_GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             case LM_GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
             case LM_GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
-            case LM_GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-            case LM_GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-            case LM_GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max));
@@ -5291,9 +5532,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
 
         default: return "unknown, may not work";
     }
@@ -5411,7 +5649,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -5424,6 +5662,16 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
 
+    if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
+
+        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
+
+        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
+    }
+
     LM_GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     LM_GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
     if (hparams.n_expert > 0) {
@@ -5432,13 +5680,13 @@ static void llm_load_hparams(
         LM_GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
-    // zero-out the
+    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
 
-    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5487,7 +5735,7 @@ static void llm_load_hparams(
 
     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
-    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@@ -5527,11 +5775,24 @@ static void llm_load_hparams(
                     }
                 }
             } break;
+        case LLM_ARCH_DECI:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 80: model.type = e_model::MODEL_70B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
                 switch (hparams.n_layer) {
+                    case 52: model.type = e_model::MODEL_1B; break;
                     case 40: model.type = e_model::MODEL_2B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -5696,6 +5957,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                std::array<int, 4> section_dims;
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+                std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
+            }
+            // fall through
        case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6006,6 +6274,19 @@ static void llm_load_hparams(
                     model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 bool is_lite = (hparams.n_layer == 27);
@@ -6159,6 +6440,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+            } break;
         default: (void)0;
     }
 
@@ -6188,7 +6476,7 @@ static void llm_load_vocab(
     ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
     ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
-    if (tokenizer_model == "no_vocab") {
+    if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
         vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
         // default special tokens
@@ -6326,7 +6614,8 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "llama3" ||
                 tokenizer_pre == "llama-v3" ||
-                tokenizer_pre == "llama-bpe"
+                tokenizer_pre == "llama-bpe"||
+                tokenizer_pre == "falcon3") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             vocab.tokenizer_ignore_merges = true;
             vocab.tokenizer_add_bos = true;
@@ -6352,10 +6641,12 @@ static void llm_load_vocab(
                 tokenizer_pre == "phi-2" ||
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "gigachat" ||
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "jina-v2-code"
+                tokenizer_pre == "jina-v2-code" ||
+                tokenizer_pre == "roberta-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -6422,6 +6713,12 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
             vocab.tokenizer_add_bos = true;
            vocab.tokenizer_clean_spaces = false;
+        } else if (
+                tokenizer_pre == "minerva-7b") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+        } else if (
+                tokenizer_pre == "megrez") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -7000,6 +7297,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
7000
7297
|
|
7001
7298
|
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
|
7002
7299
|
|
7300
|
+
if (model.arch == LLM_ARCH_DEEPSEEK) {
|
7301
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
7302
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
7303
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
7304
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
7305
|
+
}
|
7306
|
+
|
7003
7307
|
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
7004
7308
|
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
7005
7309
|
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
@@ -7015,7 +7319,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
7015
7319
|
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
7016
7320
|
}
|
7017
7321
|
|
7018
|
-
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
|
7322
|
+
if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
|
7019
7323
|
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
7020
7324
|
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
7021
7325
|
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
@@ -7156,6 +7460,22 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
|
|
7156
7460
|
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT_ID}},
|
7157
7461
|
// this tensor is loaded for T5, but never used
|
7158
7462
|
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_NONE}},
|
7463
|
+
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, LM_GGML_OP_IM2COL}},
|
7464
|
+
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
|
7465
|
+
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
|
7466
|
+
{LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
|
7467
|
+
{LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
|
7468
|
+
{LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
|
7469
|
+
{LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
|
7470
|
+
{LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
|
7471
|
+
{LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
|
7472
|
+
{LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
|
7473
|
+
{LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
|
7474
|
+
{LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
|
7475
|
+
{LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
|
7476
|
+
{LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
|
7477
|
+
{LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
|
7478
|
+
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
|
7159
7479
|
};
|
7160
7480
|
|
7161
7481
|
// checks if the weight tensor can be used with the specified buffer type and device
|
@@ -7260,6 +7580,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
|
|
7260
7580
|
lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
|
7261
7581
|
op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
|
7262
7582
|
} break;
|
7583
|
+
case LM_GGML_OP_IM2COL:
|
7584
|
+
{
|
7585
|
+
const int n_embd = hparams.n_embd;
|
7586
|
+
lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
|
7587
|
+
op_tensor = lm_ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, LM_GGML_TYPE_F16);
|
7588
|
+
} break;
|
7263
7589
|
default:
|
7264
7590
|
LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
|
7265
7591
|
}
|
@@ -7390,7 +7716,8 @@ static bool llm_load_tensors(
|
|
7390
7716
|
model.main_gpu = main_gpu;
|
7391
7717
|
model.n_gpu_layers = n_gpu_layers;
|
7392
7718
|
|
7393
|
-
const int n_layer
|
7719
|
+
const int n_layer = hparams.n_layer;
|
7720
|
+
|
7394
7721
|
bool use_mmap_buffer = true;
|
7395
7722
|
|
7396
7723
|
// build a list of buffer types for the CPU and GPU devices
|
@@ -7640,7 +7967,13 @@ static bool llm_load_tensors(
|
|
7640
7967
|
|
7641
7968
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
7642
7969
|
|
7643
|
-
|
7970
|
+
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
7971
|
+
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
7972
|
+
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
7973
|
+
}
|
7974
|
+
else {
|
7975
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
7976
|
+
}
|
7644
7977
|
|
7645
7978
|
if (n_expert == 0) {
|
7646
7979
|
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
@@ -7659,6 +7992,68 @@ static bool llm_load_tensors(
|
|
7659
7992
|
}
|
7660
7993
|
}
|
7661
7994
|
} break;
|
7995
|
+
case LLM_ARCH_DECI:
|
7996
|
+
{
|
7997
|
+
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
7998
|
+
|
7999
|
+
// output
|
8000
|
+
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
8001
|
+
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8002
|
+
|
8003
|
+
// if output is NULL, init from the input tok embed
|
8004
|
+
if (model.output == NULL) {
|
8005
|
+
model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
8006
|
+
}
|
8007
|
+
|
8008
|
+
for (int i = 0; i < n_layer; ++i) {
|
8009
|
+
auto & layer = model.layers[i];
|
8010
|
+
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
|
8011
|
+
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
|
8012
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
|
8013
|
+
const int64_t n_ff = hparams.n_ff(i);
|
8014
|
+
const int64_t n_head = hparams.n_head(i);
|
8015
|
+
const int64_t n_head_kv = hparams.n_head_kv(i);
|
8016
|
+
|
8017
|
+
if (n_head_kv == 0 && n_head > 0) {
|
8018
|
+
// linear attention for DeciLMCausalModel
|
8019
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
8020
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
8021
|
+
}
|
8022
|
+
else if (n_head_kv > 0) {
|
8023
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
8024
|
+
|
8025
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
8026
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
8027
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
8028
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
8029
|
+
}
|
8030
|
+
|
8031
|
+
// optional bias tensors
|
8032
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8033
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8034
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8035
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8036
|
+
|
8037
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
8038
|
+
|
8039
|
+
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
8040
|
+
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
8041
|
+
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
8042
|
+
}
|
8043
|
+
else {
|
8044
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
8045
|
+
}
|
8046
|
+
|
8047
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
8048
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
8049
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
8050
|
+
|
8051
|
+
// optional MLP bias
|
8052
|
+
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8053
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8054
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
8055
|
+
}
|
8056
|
+
} break;
|
7662
8057
|
case LLM_ARCH_MINICPM3:
|
7663
8058
|
{
|
7664
8059
|
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
@@ -8107,6 +8502,7 @@ static bool llm_load_tensors(
|
|
8107
8502
|
}
|
8108
8503
|
} break;
|
8109
8504
|
case LLM_ARCH_QWEN2:
|
8505
|
+
case LLM_ARCH_QWEN2VL:
|
8110
8506
|
{
|
8111
8507
|
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
8112
8508
|
|
@@ -8767,15 +9163,8 @@ static bool llm_load_tensors(
|
|
8767
9163
|
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
8768
9164
|
}
|
8769
9165
|
} break;
|
8770
|
-
case
|
9166
|
+
case LLM_ARCH_DEEPSEEK:
|
8771
9167
|
{
|
8772
|
-
const bool is_lite = (hparams.n_layer == 27);
|
8773
|
-
|
8774
|
-
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
8775
|
-
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
8776
|
-
|
8777
|
-
const int64_t q_lora_rank = hparams.n_lora_q;
|
8778
|
-
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
8779
9168
|
|
8780
9169
|
const int64_t n_ff_exp = hparams.n_ff_exp;
|
8781
9170
|
const int64_t n_expert_shared = hparams.n_expert_shared;
|
@@ -8790,23 +9179,11 @@ static bool llm_load_tensors(
|
|
8790
9179
|
auto & layer = model.layers[i];
|
8791
9180
|
|
8792
9181
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
8793
|
-
if (!is_lite) {
|
8794
|
-
layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
|
8795
|
-
}
|
8796
|
-
|
8797
|
-
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
|
8798
|
-
|
8799
|
-
if (!is_lite) {
|
8800
|
-
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
8801
|
-
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
|
8802
|
-
} else {
|
8803
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
8804
|
-
}
|
8805
|
-
|
8806
|
-
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
|
8807
|
-
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
|
8808
|
-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
|
8809
9182
|
|
9183
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
9184
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
9185
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
9186
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
8810
9187
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
8811
9188
|
|
8812
9189
|
if (i < (int) hparams.n_layer_dense_lead) {
|
@@ -8835,12 +9212,80 @@ static bool llm_load_tensors(
|
|
8835
9212
|
}
|
8836
9213
|
}
|
8837
9214
|
} break;
|
8838
|
-
case
|
9215
|
+
case LLM_ARCH_DEEPSEEK2:
|
8839
9216
|
{
|
8840
|
-
|
9217
|
+
const bool is_lite = (hparams.n_layer == 27);
|
8841
9218
|
|
8842
|
-
|
8843
|
-
|
9219
|
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
9220
|
+
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
9221
|
+
|
9222
|
+
const int64_t q_lora_rank = hparams.n_lora_q;
|
9223
|
+
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
9224
|
+
|
9225
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
9226
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
9227
|
+
|
9228
|
+
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
9229
|
+
|
9230
|
+
// output
|
9231
|
+
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
9232
|
+
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
9233
|
+
|
9234
|
+
for (int i = 0; i < n_layer; ++i) {
|
9235
|
+
auto & layer = model.layers[i];
|
9236
|
+
|
9237
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
9238
|
+
if (!is_lite) {
|
9239
|
+
layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
|
9240
|
+
}
|
9241
|
+
|
9242
|
+
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
|
9243
|
+
|
9244
|
+
if (!is_lite) {
|
9245
|
+
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
9246
|
+
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
|
9247
|
+
} else {
|
9248
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
9249
|
+
}
|
9250
|
+
|
9251
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
|
9252
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
|
9253
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
|
9254
|
+
|
9255
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
9256
|
+
|
9257
|
+
if (i < (int) hparams.n_layer_dense_lead) {
|
9258
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
9259
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
9260
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
9261
|
+
} else {
|
9262
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
9263
|
+
|
9264
|
+
if (n_expert == 0) {
|
9265
|
+
throw std::runtime_error("n_expert must be > 0");
|
9266
|
+
}
|
9267
|
+
if (n_expert_used == 0) {
|
9268
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
9269
|
+
}
|
9270
|
+
|
9271
|
+
// MoE branch
|
9272
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
9273
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
9274
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
9275
|
+
|
9276
|
+
// Shared expert branch
|
9277
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
9278
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
9279
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
9280
|
+
}
|
9281
|
+
}
|
9282
|
+
} break;
|
9283
|
+
case LLM_ARCH_BITNET:
|
9284
|
+
{
|
9285
|
+
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
9286
|
+
|
9287
|
+
// output
|
9288
|
+
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
8844
9289
|
|
8845
9290
|
for (int i = 0; i < n_layer; ++i) {
|
8846
9291
|
auto & layer = model.layers[i];
|
@@ -9137,9 +9582,9 @@ static bool llm_load_tensors(
|
|
9137
9582
|
} break;
|
9138
9583
|
case LLM_ARCH_CHAMELEON:
|
9139
9584
|
{
|
9140
|
-
|
9585
|
+
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
9141
9586
|
|
9142
|
-
|
9587
|
+
// output
|
9143
9588
|
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
9144
9589
|
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
9145
9590
|
// if output is NULL, init from the input tok embed
|
@@ -9168,6 +9613,109 @@ static bool llm_load_tensors(
|
|
9168
9613
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
9169
9614
|
}
|
9170
9615
|
} break;
|
9616
|
+
case LLM_ARCH_WAVTOKENIZER_DEC:
|
9617
|
+
{
|
9618
|
+
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
|
9619
|
+
|
9620
|
+
model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
|
9621
|
+
model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
|
9622
|
+
|
9623
|
+
// posnet
|
9624
|
+
{
|
9625
|
+
const int64_t n_embd = hparams.posnet.n_embd;
|
9626
|
+
|
9627
|
+
for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
|
9628
|
+
auto & layer = model.layers[i].posnet;
|
9629
|
+
|
9630
|
+
// posnet:
|
9631
|
+
//
|
9632
|
+
// - resnet
|
9633
|
+
// - resnet
|
9634
|
+
// - attn
|
9635
|
+
// - resnet
|
9636
|
+
// - resnet
|
9637
|
+
// - norm
|
9638
|
+
//
|
9639
|
+
switch (i) {
|
9640
|
+
case 0:
|
9641
|
+
case 1:
|
9642
|
+
case 3:
|
9643
|
+
case 4:
|
9644
|
+
{
|
9645
|
+
layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
|
9646
|
+
layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
|
9647
|
+
|
9648
|
+
layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
|
9649
|
+
layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
|
9650
|
+
|
9651
|
+
layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
|
9652
|
+
layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
|
9653
|
+
|
9654
|
+
layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
|
9655
|
+
layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
|
9656
|
+
} break;
|
9657
|
+
case 2:
|
9658
|
+
{
|
9659
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
|
9660
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
|
9661
|
+
|
9662
|
+
layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
|
9663
|
+
layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
|
9664
|
+
|
9665
|
+
layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
|
9666
|
+
layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
|
9667
|
+
|
9668
|
+
layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
|
9669
|
+
layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
|
9670
|
+
|
9671
|
+
layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
|
9672
|
+
layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
|
9673
|
+
} break;
|
9674
|
+
case 5:
|
9675
|
+
{
|
9676
|
+
layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
|
9677
|
+
layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
|
9678
|
+
} break;
|
9679
|
+
default: LM_GGML_ABORT("unknown posnet layer");
|
9680
|
+
};
|
9681
|
+
}
|
9682
|
+
}
|
9683
|
+
|
9684
|
+
LM_GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
|
9685
|
+
|
9686
|
+
model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
|
9687
|
+
model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
|
9688
|
+
|
9689
|
+
// convnext
|
9690
|
+
{
|
9691
|
+
const int64_t n_embd = hparams.convnext.n_embd;
|
9692
|
+
|
9693
|
+
for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
|
9694
|
+
auto & layer = model.layers[i].convnext;
|
9695
|
+
|
9696
|
+
layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
|
9697
|
+
layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
|
9698
|
+
|
9699
|
+
layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
|
9700
|
+
layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
|
9701
|
+
|
9702
|
+
layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
|
9703
|
+
layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
|
9704
|
+
|
9705
|
+
layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
|
9706
|
+
layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
|
9707
|
+
|
9708
|
+
layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
|
9709
|
+
}
|
9710
|
+
|
9711
|
+
// output
|
9712
|
+
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
9713
|
+
model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
9714
|
+
}
|
9715
|
+
|
9716
|
+
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
|
9717
|
+
model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
|
9718
|
+
} break;
|
9171
9719
|
default:
|
9172
9720
|
throw std::runtime_error("unknown architecture");
|
9173
9721
|
}
|
@@ -9387,6 +9935,7 @@ enum llm_ffn_gate_type {
|
|
9387
9935
|
enum llm_norm_type {
|
9388
9936
|
LLM_NORM,
|
9389
9937
|
LLM_NORM_RMS,
|
9938
|
+
LLM_NORM_GROUP,
|
9390
9939
|
};
|
9391
9940
|
|
9392
9941
|
static struct lm_ggml_tensor * llm_build_inp_embd(
|
@@ -9407,7 +9956,7 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
|
|
9407
9956
|
|
9408
9957
|
inpL = lm_ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
9409
9958
|
} else {
|
9410
|
-
|
9959
|
+
lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens);
|
9411
9960
|
inpL = lctx.inp_embd;
|
9412
9961
|
lm_ggml_set_input(lctx.inp_embd);
|
9413
9962
|
}
|
@@ -9528,8 +10077,14 @@ static struct lm_ggml_tensor * llm_build_norm(
|
|
9528
10077
|
const llm_build_cb & cb,
|
9529
10078
|
int il) {
|
9530
10079
|
switch (type) {
|
9531
|
-
case LLM_NORM:
|
9532
|
-
case LLM_NORM_RMS:
|
10080
|
+
case LLM_NORM: cur = lm_ggml_norm (ctx, cur, hparams.f_norm_eps); break;
|
10081
|
+
case LLM_NORM_RMS: cur = lm_ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
|
10082
|
+
case LLM_NORM_GROUP:
|
10083
|
+
{
|
10084
|
+
cur = lm_ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
|
10085
|
+
cur = lm_ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
|
10086
|
+
cur = lm_ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
|
10087
|
+
} break;
|
9533
10088
|
}
|
9534
10089
|
|
9535
10090
|
if (mw || mb) {
|
@@ -10868,6 +11423,167 @@ struct llm_build_context {
|
|
10868
11423
|
return gf;
|
10869
11424
|
}
|
10870
11425
|
|
11426
|
+
struct lm_ggml_cgraph * build_deci() {
|
11427
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
11428
|
+
|
11429
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
11430
|
+
int32_t n_tokens = this->n_tokens;
|
11431
|
+
|
11432
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
11433
|
+
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
11434
|
+
LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
|
11435
|
+
|
11436
|
+
struct lm_ggml_tensor * cur;
|
11437
|
+
struct lm_ggml_tensor * inpL;
|
11438
|
+
|
11439
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
11440
|
+
|
11441
|
+
// inp_pos - contains the positions
|
11442
|
+
struct lm_ggml_tensor * inp_pos = build_inp_pos();
|
11443
|
+
|
11444
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
11445
|
+
struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
11446
|
+
|
11447
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
11448
|
+
for (int il = 0; il < n_layer; ++il) {
|
11449
|
+
struct lm_ggml_tensor * inpSA = inpL;
|
11450
|
+
const int64_t n_head_kv = hparams.n_head_kv(il);
|
11451
|
+
const int64_t n_head = hparams.n_head(il);
|
11452
|
+
|
11453
|
+
if (n_head == 0) {
|
11454
|
+
// attention-free layer of Llama-3_1-Nemotron-51B
|
11455
|
+
cur = inpL;
|
11456
|
+
} else {
|
11457
|
+
// norm
|
11458
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11459
|
+
model.layers[il].attn_norm, NULL,
|
11460
|
+
LLM_NORM_RMS, cb, il);
|
11461
|
+
cb(cur, "attn_norm", il);
|
11462
|
+
}
|
11463
|
+
|
11464
|
+
if (n_head > 0 && n_head_kv == 0) {
|
11465
|
+
// "linear attention" of Llama-3_1-Nemotron-51B
|
11466
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
11467
|
+
cb(cur, "wo", il);
|
11468
|
+
} else if (n_head > 0) {
|
11469
|
+
// self-attention
|
11470
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
11471
|
+
struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
|
11472
|
+
|
11473
|
+
// compute Q and K and RoPE them
|
11474
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
11475
|
+
cb(Qcur, "Qcur", il);
|
11476
|
+
if (model.layers[il].bq) {
|
11477
|
+
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
11478
|
+
cb(Qcur, "Qcur", il);
|
11479
|
+
}
|
11480
|
+
|
11481
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
11482
|
+
cb(Kcur, "Kcur", il);
|
11483
|
+
if (model.layers[il].bk) {
|
11484
|
+
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
11485
|
+
cb(Kcur, "Kcur", il);
|
11486
|
+
}
|
11487
|
+
|
11488
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
11489
|
+
cb(Vcur, "Vcur", il);
|
11490
|
+
if (model.layers[il].bv) {
|
11491
|
+
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
11492
|
+
cb(Vcur, "Vcur", il);
|
11493
|
+
}
|
11494
|
+
|
11495
|
+
Qcur = lm_ggml_rope_ext(
|
11496
|
+
ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
11497
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
11498
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
11499
|
+
);
|
11500
|
+
cb(Qcur, "Qcur", il);
|
11501
|
+
|
11502
|
+
Kcur = lm_ggml_rope_ext(
|
11503
|
+
ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
|
11504
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
11505
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
11506
|
+
);
|
11507
|
+
cb(Kcur, "Kcur", il);
|
11508
|
+
|
11509
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11510
|
+
model.layers[il].wo, model.layers[il].bo,
|
11511
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
11512
|
+
}
|
11513
|
+
|
11514
|
+
if (il == n_layer - 1) {
|
11515
|
+
// skip computing output for unused tokens
|
11516
|
+
struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
|
11517
|
+
n_tokens = n_outputs;
|
11518
|
+
cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
|
11519
|
+
inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
11520
|
+
}
|
11521
|
+
|
11522
|
+
// For Granite architecture
|
11523
|
+
if (hparams.f_residual_scale) {
|
11524
|
+
cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
11525
|
+
}
|
11526
|
+
|
11527
|
+
// modified to support attention-free layer of Llama-3_1-Nemotron-51B
|
11528
|
+
struct lm_ggml_tensor * ffn_inp = cur;
|
11529
|
+
if (n_head > 0) {
|
11530
|
+
ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
|
11531
|
+
cb(ffn_inp, "ffn_inp", il);
|
11532
|
+
}
|
11533
|
+
|
11534
|
+
// feed-forward network
|
11535
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
11536
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
11537
|
+
model.layers[il].ffn_norm, NULL,
|
11538
|
+
LLM_NORM_RMS, cb, il);
|
11539
|
+
cb(cur, "ffn_norm", il);
|
11540
|
+
|
11541
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11542
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
11543
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
11544
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
11545
|
+
NULL,
|
11546
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
11547
|
+
cb(cur, "ffn_out", il);
|
11548
|
+
}
|
11549
|
+
|
11550
|
+
// For Granite architecture
|
11551
|
+
if (hparams.f_residual_scale) {
|
11552
|
+
cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
11553
|
+
}
|
11554
|
+
|
11555
|
+
cur = lm_ggml_add(ctx0, cur, ffn_inp);
|
11556
|
+
cb(cur, "ffn_out", il);
|
11557
|
+
|
11558
|
+
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
11559
|
+
cb(cur, "l_out", il);
|
11560
|
+
|
11561
|
+
// input for next layer
|
11562
|
+
inpL = cur;
|
11563
|
+
}
|
11564
|
+
|
11565
|
+
cur = inpL;
|
11566
|
+
|
11567
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
11568
|
+
model.output_norm, NULL,
|
11569
|
+
LLM_NORM_RMS, cb, -1);
|
11570
|
+
cb(cur, "result_norm", -1);
|
11571
|
+
|
11572
|
+
// lm_head
|
11573
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11574
|
+
|
11575
|
+
// For Granite architecture
|
11576
|
+
if (hparams.f_logit_scale) {
|
11577
|
+
cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
11578
|
+
}
|
11579
|
+
|
11580
|
+
cb(cur, "result_output", -1);
|
11581
|
+
|
11582
|
+
lm_ggml_build_forward_expand(gf, cur);
|
11583
|
+
|
11584
|
+
return gf;
|
11585
|
+
}
|
11586
|
+
|
10871
11587
|
struct lm_ggml_cgraph * build_baichuan() {
|
10872
11588
|
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
10873
11589
|
|
@@ -12496,12 +13212,8 @@ struct llm_build_context {
|
|
12496
13212
|
return gf;
|
12497
13213
|
}
|
12498
13214
|
|
12499
|
-
struct lm_ggml_cgraph *
|
13215
|
+
struct lm_ggml_cgraph * build_qwen2vl() {
|
12500
13216
|
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
12501
|
-
|
12502
|
-
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
12503
|
-
int32_t n_tokens = this->n_tokens;
|
12504
|
-
|
12505
13217
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
12506
13218
|
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
12507
13219
|
LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
|
@@ -12512,10 +13224,15 @@ struct llm_build_context {
|
|
12512
13224
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
12513
13225
|
|
12514
13226
|
// inp_pos - contains the positions
|
12515
|
-
|
13227
|
+
lctx.inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens * 4);
|
13228
|
+
cb(lctx.inp_pos, "inp_pos", -1);
|
13229
|
+
lm_ggml_set_input(lctx.inp_pos);
|
13230
|
+
struct lm_ggml_tensor * inp_pos = lctx.inp_pos;
|
12516
13231
|
|
12517
13232
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
12518
13233
|
struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
13234
|
+
int sections[4];
|
13235
|
+
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
12519
13236
|
|
12520
13237
|
for (int il = 0; il < n_layer; ++il) {
|
12521
13238
|
struct lm_ggml_tensor * inpSA = inpL;
|
@@ -12526,7 +13243,7 @@ struct llm_build_context {
|
|
12526
13243
|
LLM_NORM_RMS, cb, il);
|
12527
13244
|
cb(cur, "attn_norm", il);
|
12528
13245
|
|
12529
|
-
//
|
13246
|
+
// self-attention
|
12530
13247
|
{
|
12531
13248
|
// compute Q and K and RoPE them
|
12532
13249
|
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
@@ -12544,8 +13261,125 @@ struct llm_build_context {
|
|
12544
13261
|
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
12545
13262
|
cb(Vcur, "Vcur", il);
|
12546
13263
|
|
12547
|
-
Qcur =
|
12548
|
-
ctx0,
|
13264
|
+
Qcur = lm_ggml_rope_multi(
|
13265
|
+
ctx0,
|
13266
|
+
lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
13267
|
+
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
13268
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
13269
|
+
);
|
13270
|
+
cb(Qcur, "Qcur", il);
|
13271
|
+
|
13272
|
+
Kcur = lm_ggml_rope_multi(
|
13273
|
+
ctx0,
|
13274
|
+
lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
13275
|
+
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
13276
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
13277
|
+
);
|
13278
|
+
cb(Kcur, "Kcur", il);
|
13279
|
+
|
13280
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
13281
|
+
model.layers[il].wo, model.layers[il].bo,
|
13282
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
13283
|
+
}
|
13284
|
+
|
13285
|
+
if (il == n_layer - 1) {
|
13286
|
+
// skip computing output for unused tokens
|
13287
|
+
struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
|
13288
|
+
cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
|
13289
|
+
inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
13290
|
+
}
|
13291
|
+
|
13292
|
+
struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
|
13293
|
+
cb(ffn_inp, "ffn_inp", il);
|
13294
|
+
|
13295
|
+
// feed-forward network
|
13296
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
13297
|
+
model.layers[il].ffn_norm, NULL,
|
13298
|
+
LLM_NORM_RMS, cb, il);
|
13299
|
+
cb(cur, "ffn_norm", il);
|
13300
|
+
|
13301
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
13302
|
+
model.layers[il].ffn_up, NULL, NULL,
|
13303
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
13304
|
+
model.layers[il].ffn_down, NULL, NULL,
|
13305
|
+
NULL,
|
13306
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
13307
|
+
cb(cur, "ffn_out", il);
|
13308
|
+
|
13309
|
+
cur = lm_ggml_add(ctx0, cur, ffn_inp);
|
13310
|
+
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
13311
|
+
cb(cur, "l_out", il);
|
13312
|
+
|
13313
|
+
// input for next layer
|
13314
|
+
inpL = cur;
|
13315
|
+
}
|
13316
|
+
|
13317
|
+
cur = inpL;
|
13318
|
+
|
13319
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
13320
|
+
model.output_norm, NULL,
|
13321
|
+
LLM_NORM_RMS, cb, -1);
|
13322
|
+
cb(cur, "result_norm", -1);
|
13323
|
+
|
13324
|
+
// lm_head
|
13325
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
13326
|
+
cb(cur, "result_output", -1);
|
13327
|
+
|
13328
|
+
lm_ggml_build_forward_expand(gf, cur);
|
13329
|
+
|
13330
|
+
return gf;
|
13331
|
+
}
|
13332
|
+
|
13333
|
+
struct lm_ggml_cgraph * build_qwen2moe() {
|
13334
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
13335
|
+
|
13336
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
13337
|
+
int32_t n_tokens = this->n_tokens;
|
13338
|
+
|
13339
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
13340
|
+
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
13341
|
+
LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
|
13342
|
+
|
13343
|
+
struct lm_ggml_tensor * cur;
|
13344
|
+
struct lm_ggml_tensor * inpL;
|
13345
|
+
|
13346
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
13347
|
+
|
13348
|
+
// inp_pos - contains the positions
|
13349
|
+
struct lm_ggml_tensor * inp_pos = build_inp_pos();
|
13350
|
+
|
13351
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
13352
|
+
struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
13353
|
+
|
13354
|
+
for (int il = 0; il < n_layer; ++il) {
|
13355
|
+
struct lm_ggml_tensor * inpSA = inpL;
|
13356
|
+
|
13357
|
+
// norm
|
13358
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
13359
|
+
model.layers[il].attn_norm, NULL,
|
13360
|
+
LLM_NORM_RMS, cb, il);
|
13361
|
+
cb(cur, "attn_norm", il);
|
13362
|
+
|
13363
|
+
// self_attention
|
13364
|
+
{
|
13365
|
+
// compute Q and K and RoPE them
|
13366
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
13367
|
+
cb(Qcur, "Qcur", il);
|
13368
|
+
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
13369
|
+
cb(Qcur, "Qcur", il);
|
13370
|
+
|
13371
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
13372
|
+
cb(Kcur, "Kcur", il);
|
13373
|
+
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
13374
|
+
cb(Kcur, "Kcur", il);
|
13375
|
+
|
13376
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
13377
|
+
cb(Vcur, "Vcur", il);
|
13378
|
+
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
13379
|
+
cb(Vcur, "Vcur", il);
|
13380
|
+
|
13381
|
+
Qcur = lm_ggml_rope_ext(
|
13382
|
+
ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
12549
13383
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
12550
13384
|
ext_factor, attn_factor, beta_fast, beta_slow
|
12551
13385
|
);
|
@@ -12779,7 +13613,13 @@ struct llm_build_context {
|
|
12779
13613
|
struct lm_ggml_tensor * inp_pos = build_inp_pos();
|
12780
13614
|
|
12781
13615
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
12782
|
-
struct lm_ggml_tensor *
|
13616
|
+
struct lm_ggml_tensor * KQ_mask = nullptr;
|
13617
|
+
if (hparams.n_swa == 0) {
|
13618
|
+
// Phi-4 doesn't use sliding window attention
|
13619
|
+
KQ_mask = build_inp_KQ_mask();
|
13620
|
+
} else {
|
13621
|
+
KQ_mask = build_inp_KQ_mask_swa();
|
13622
|
+
}
|
12783
13623
|
|
12784
13624
|
for (int il = 0; il < n_layer; ++il) {
|
12785
13625
|
auto residual = inpL;
|
@@ -12837,7 +13677,7 @@ struct llm_build_context {
|
|
12837
13677
|
|
12838
13678
|
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
12839
13679
|
model.layers[il].wo, model.layers[il].bo,
|
12840
|
-
Kcur, Vcur, Qcur,
|
13680
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
12841
13681
|
}
|
12842
13682
|
|
12843
13683
|
if (il == n_layer - 1) {
|
@@ -13447,153 +14287,6 @@ struct llm_build_context {
|
|
13447
14287
|
return gf;
|
13448
14288
|
}
|
13449
14289
|
|
13450
|
-
// ref: https://arxiv.org/abs/2203.03466
|
13451
|
-
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
|
13452
|
-
// based on the original build_llama() function
|
13453
|
-
struct lm_ggml_cgraph * build_minicpm() {
|
13454
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
13455
|
-
|
13456
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
13457
|
-
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
13458
|
-
LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
|
13459
|
-
|
13460
|
-
const int64_t n_embd = hparams.n_embd;
|
13461
|
-
//TODO: if the model varies, these parameters need to be read from the model
|
13462
|
-
const int64_t n_embd_base = 256;
|
13463
|
-
const float scale_embd = 12.0f;
|
13464
|
-
const float scale_depth = 1.4f;
|
13465
|
-
|
13466
|
-
struct lm_ggml_tensor * cur;
|
13467
|
-
struct lm_ggml_tensor * inpL;
|
13468
|
-
|
13469
|
-
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
13470
|
-
|
13471
|
-
// scale the input embeddings
|
13472
|
-
inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
|
13473
|
-
cb(inpL, "inp_scaled", -1);
|
13474
|
-
|
13475
|
-
// inp_pos - contains the positions
|
13476
|
-
struct lm_ggml_tensor * inp_pos = build_inp_pos();
|
13477
|
-
|
13478
|
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
13479
|
-
struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
13480
|
-
|
13481
|
-
for (int il = 0; il < n_layer; ++il) {
|
13482
|
-
struct lm_ggml_tensor * inpSA = inpL;
|
13483
|
-
|
13484
|
-
// norm
|
13485
|
-
cur = llm_build_norm(ctx0, inpL, hparams,
|
13486
|
-
model.layers[il].attn_norm, NULL,
|
13487
|
-
LLM_NORM_RMS, cb, il);
|
13488
|
-
cb(cur, "attn_norm", il);
|
13489
|
-
|
13490
|
-
// self-attention
|
13491
|
-
{
|
13492
|
-
// compute Q and K and RoPE them
|
13493
|
-
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
13494
|
-
cb(Qcur, "Qcur", il);
|
13495
|
-
if (model.layers[il].bq) {
|
13496
|
-
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
13497
|
-
cb(Qcur, "Qcur", il);
|
13498
|
-
}
|
13499
|
-
|
13500
|
-
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
13501
|
-
cb(Kcur, "Kcur", il);
|
13502
|
-
if (model.layers[il].bk) {
|
13503
|
-
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
13504
|
-
cb(Kcur, "Kcur", il);
|
13505
|
-
}
|
13506
|
-
|
13507
|
-
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
13508
|
-
cb(Vcur, "Vcur", il);
|
13509
|
-
if (model.layers[il].bv) {
|
13510
|
-
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
13511
|
-
cb(Vcur, "Vcur", il);
|
13512
|
-
}
|
13513
|
-
|
13514
|
-
Qcur = lm_ggml_rope_ext(
|
13515
|
-
ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
13516
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
13517
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
13518
|
-
);
|
13519
|
-
cb(Qcur, "Qcur", il);
|
13520
|
-
|
13521
|
-
Kcur = lm_ggml_rope_ext(
|
13522
|
-
ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
13523
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
13524
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
13525
|
-
);
|
13526
|
-
cb(Kcur, "Kcur", il);
|
13527
|
-
|
13528
|
-
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
13529
|
-
model.layers[il].wo, model.layers[il].bo,
|
13530
|
-
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
13531
|
-
}
|
13532
|
-
|
13533
|
-
if (il == n_layer - 1) {
|
13534
|
-
// skip computing output for unused tokens
|
13535
|
-
struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
|
13536
|
-
cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
|
13537
|
-
inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
13538
|
-
}
|
13539
|
-
|
13540
|
-
// scale_res - scale the hidden states for residual connection
|
13541
|
-
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
13542
|
-
cur = lm_ggml_scale(ctx0, cur, scale_res);
|
13543
|
-
cb(cur, "hidden_scaled", -1);
|
13544
|
-
|
13545
|
-
struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
|
13546
|
-
cb(ffn_inp, "ffn_inp", il);
|
13547
|
-
|
13548
|
-
// feed-forward network
|
13549
|
-
{
|
13550
|
-
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
13551
|
-
model.layers[il].ffn_norm, NULL,
|
13552
|
-
LLM_NORM_RMS, cb, il);
|
13553
|
-
cb(cur, "ffn_norm", il);
|
13554
|
-
|
13555
|
-
cur = llm_build_ffn(ctx0, lctx, cur,
|
13556
|
-
model.layers[il].ffn_up, NULL, NULL,
|
13557
|
-
model.layers[il].ffn_gate, NULL, NULL,
|
13558
|
-
model.layers[il].ffn_down, NULL, NULL,
|
13559
|
-
NULL,
|
13560
|
-
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
13561
|
-
cb(cur, "ffn_out", il);
|
13562
|
-
}
|
13563
|
-
|
13564
|
-
// scale the hidden states for residual connection
|
13565
|
-
cur = lm_ggml_scale(ctx0, cur, scale_res);
|
13566
|
-
cb(cur, "hidden_scaled_ffn", -1);
|
13567
|
-
|
13568
|
-
cur = lm_ggml_add(ctx0, cur, ffn_inp);
|
13569
|
-
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
13570
|
-
cb(cur, "l_out", il);
|
13571
|
-
|
13572
|
-
// input for next layer
|
13573
|
-
inpL = cur;
|
13574
|
-
}
|
13575
|
-
|
13576
|
-
cur = inpL;
|
13577
|
-
|
13578
|
-
cur = llm_build_norm(ctx0, cur, hparams,
|
13579
|
-
model.output_norm, NULL,
|
13580
|
-
LLM_NORM_RMS, cb, -1);
|
13581
|
-
cb(cur, "result_norm", -1);
|
13582
|
-
|
13583
|
-
// lm_head scaling
|
13584
|
-
const float scale_lmhead = float(n_embd_base)/float(n_embd);
|
13585
|
-
cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
|
13586
|
-
cb(cur, "lmhead_scaling", -1);
|
13587
|
-
|
13588
|
-
// lm_head
|
13589
|
-
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
13590
|
-
cb(cur, "result_output", -1);
|
13591
|
-
|
13592
|
-
lm_ggml_build_forward_expand(gf, cur);
|
13593
|
-
|
13594
|
-
return gf;
|
13595
|
-
}
|
13596
|
-
|
13597
14290
|
struct lm_ggml_cgraph * build_minicpm3() {
|
13598
14291
|
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
13599
14292
|
|
@@ -15061,22 +15754,169 @@ struct llm_build_context {
|
|
15061
15754
|
cb(Vcur, "Vcur", il);
|
15062
15755
|
|
15063
15756
|
Qcur = lm_ggml_rope_ext(
|
15064
|
-
ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
15757
|
+
ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
15758
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
15759
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
15760
|
+
);
|
15761
|
+
cb(Qcur, "Qcur", il);
|
15762
|
+
|
15763
|
+
Kcur = lm_ggml_rope_ext(
|
15764
|
+
ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
15765
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
15766
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
15767
|
+
);
|
15768
|
+
cb(Kcur, "Kcur", il);
|
15769
|
+
|
15770
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
15771
|
+
model.layers[il].wo, NULL,
|
15772
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
15773
|
+
}
|
15774
|
+
|
15775
|
+
if (il == n_layer - 1) {
|
15776
|
+
// skip computing output for unused tokens
|
15777
|
+
struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
|
15778
|
+
n_tokens = n_outputs;
|
15779
|
+
cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
|
15780
|
+
inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
15781
|
+
}
|
15782
|
+
|
15783
|
+
struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
|
15784
|
+
cb(ffn_inp, "ffn_inp", il);
|
15785
|
+
|
15786
|
+
// feed-forward network
|
15787
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
15788
|
+
model.layers[il].ffn_norm, NULL,
|
15789
|
+
LLM_NORM_RMS, cb, il);
|
15790
|
+
cb(cur, "ffn_norm", il);
|
15791
|
+
|
15792
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
15793
|
+
model.layers[il].ffn_up, NULL, NULL,
|
15794
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
15795
|
+
model.layers[il].ffn_down, NULL, NULL,
|
15796
|
+
NULL,
|
15797
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
15798
|
+
cb(cur, "ffn_out", il);
|
15799
|
+
|
15800
|
+
struct lm_ggml_tensor * ffn_out = lm_ggml_add(ctx0, cur, ffn_inp);
|
15801
|
+
cb(ffn_out, "ffn_out", il);
|
15802
|
+
|
15803
|
+
// MoE
|
15804
|
+
cur = llm_build_norm(ctx0, inpSA, hparams,
|
15805
|
+
model.layers[il].ffn_norm_exps, NULL,
|
15806
|
+
LLM_NORM_RMS, cb, il);
|
15807
|
+
cb(cur, "ffn_norm_exps", il);
|
15808
|
+
|
15809
|
+
cur = llm_build_moe_ffn(ctx0, lctx, cur,
|
15810
|
+
model.layers[il].ffn_gate_inp,
|
15811
|
+
model.layers[il].ffn_up_exps,
|
15812
|
+
model.layers[il].ffn_gate_exps,
|
15813
|
+
model.layers[il].ffn_down_exps,
|
15814
|
+
n_expert, n_expert_used,
|
15815
|
+
LLM_FFN_SILU, true,
|
15816
|
+
false, 0.0,
|
15817
|
+
cb, il);
|
15818
|
+
cb(cur, "ffn_moe_out", il);
|
15819
|
+
|
15820
|
+
cur = lm_ggml_add(ctx0, cur, ffn_out);
|
15821
|
+
cb(cur, "ffn_out", il);
|
15822
|
+
|
15823
|
+
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
15824
|
+
cb(cur, "l_out", il);
|
15825
|
+
|
15826
|
+
// input for next layer
|
15827
|
+
inpL = cur;
|
15828
|
+
}
|
15829
|
+
|
15830
|
+
cur = inpL;
|
15831
|
+
|
15832
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
15833
|
+
model.output_norm, NULL,
|
15834
|
+
LLM_NORM_RMS, cb, -1);
|
15835
|
+
cb(cur, "result_norm", -1);
|
15836
|
+
|
15837
|
+
// lm_head
|
15838
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
15839
|
+
cb(cur, "result_output", -1);
|
15840
|
+
|
15841
|
+
lm_ggml_build_forward_expand(gf, cur);
|
15842
|
+
|
15843
|
+
return gf;
|
15844
|
+
}
|
15845
|
+
|
15846
|
+
struct lm_ggml_cgraph * build_deepseek() {
|
15847
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
15848
|
+
|
15849
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
15850
|
+
int32_t n_tokens = this->n_tokens;
|
15851
|
+
|
15852
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
15853
|
+
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
15854
|
+
LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
|
15855
|
+
|
15856
|
+
struct lm_ggml_tensor * cur;
|
15857
|
+
struct lm_ggml_tensor * inpL;
|
15858
|
+
|
15859
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
15860
|
+
|
15861
|
+
// inp_pos - contains the positions
|
15862
|
+
struct lm_ggml_tensor * inp_pos = build_inp_pos();
|
15863
|
+
|
15864
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
15865
|
+
struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
15866
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
15867
|
+
for (int il = 0; il < n_layer; ++il) {
|
15868
|
+
struct lm_ggml_tensor * inpSA = inpL;
|
15869
|
+
|
15870
|
+
// norm
|
15871
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
15872
|
+
model.layers[il].attn_norm, NULL,
|
15873
|
+
LLM_NORM_RMS, cb, il);
|
15874
|
+
cb(cur, "attn_norm", il);
|
15875
|
+
|
15876
|
+
// self-attention
|
15877
|
+
{
|
15878
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
15879
|
+
struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
|
15880
|
+
|
15881
|
+
// compute Q and K and RoPE them
|
15882
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
15883
|
+
cb(Qcur, "Qcur", il);
|
15884
|
+
if (model.layers[il].bq) {
|
15885
|
+
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
15886
|
+
cb(Qcur, "Qcur", il);
|
15887
|
+
}
|
15888
|
+
|
15889
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
15890
|
+
cb(Kcur, "Kcur", il);
|
15891
|
+
if (model.layers[il].bk) {
|
15892
|
+
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
15893
|
+
cb(Kcur, "Kcur", il);
|
15894
|
+
}
|
15895
|
+
|
15896
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
15897
|
+
cb(Vcur, "Vcur", il);
|
15898
|
+
if (model.layers[il].bv) {
|
15899
|
+
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
15900
|
+
cb(Vcur, "Vcur", il);
|
15901
|
+
}
|
15902
|
+
|
15903
|
+
Qcur = lm_ggml_rope_ext(
|
15904
|
+
+ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);

 Kcur = lm_ggml_rope_ext(
-ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);

 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-model.layers[il].wo,
-Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv,
+model.layers[il].wo, model.layers[il].bo,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
 }

 if (il == n_layer - 1) {
@@ -15087,46 +15927,53 @@ struct llm_build_context {
 inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
 }

+
 struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);

-// feed-forward network
 cur = llm_build_norm(ctx0, ffn_inp, hparams,
 model.layers[il].ffn_norm, NULL,
 LLM_NORM_RMS, cb, il);
 cb(cur, "ffn_norm", il);

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+if ((uint32_t) il < hparams.n_layer_dense_lead) {
+cur = llm_build_ffn(ctx0, lctx, cur,
+model.layers[il].ffn_up, NULL, NULL,
+model.layers[il].ffn_gate, NULL, NULL,
+model.layers[il].ffn_down, NULL, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(cur, "ffn_out", il);
+} else {
+// MoE branch
+lm_ggml_tensor * moe_out =
+llm_build_moe_ffn(ctx0, lctx, cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+n_expert, n_expert_used,
+LLM_FFN_SILU, false,
+false, hparams.expert_weights_scale,
+cb, il);
+cb(moe_out, "ffn_moe_out", il);

-
-
-
-
-
-
-
-
-cb, il);
-cb(cur, "ffn_moe_out", il);
+// FFN shared expert
+{
+lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+model.layers[il].ffn_up_shexp, NULL, NULL,
+model.layers[il].ffn_gate_shexp, NULL, NULL,
+model.layers[il].ffn_down_shexp, NULL, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(ffn_shexp, "ffn_shexp", il);

-
-
+cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
+cb(cur, "ffn_out", il);
+}
+}

+cur = lm_ggml_add(ctx0, cur, ffn_inp);
 cur = lctx.cvec.apply_to(ctx0, cur, il);
 cb(cur, "l_out", il);

@@ -15143,6 +15990,7 @@ struct llm_build_context {

 // lm_head
 cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
 cb(cur, "result_output", -1);

 lm_ggml_build_forward_expand(gf, cur);
@@ -15529,7 +16377,7 @@ struct llm_build_context {
 return gf;
 }

-struct lm_ggml_cgraph *
+struct lm_ggml_cgraph * build_t5_enc() {
 struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -15661,7 +16509,7 @@ struct llm_build_context {
 return gf;
 }

-struct lm_ggml_cgraph *
+struct lm_ggml_cgraph * build_t5_dec() {
 struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16610,6 +17458,158 @@ struct llm_build_context {

 return gf;
 }
+
+struct lm_ggml_cgraph * build_wavtokenizer_dec() {
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+struct lm_ggml_tensor * cur;
+struct lm_ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, inpL));
+
+cur = lm_ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
+cur = lm_ggml_add(ctx0, cur, model.conv1d_b);
+
+// posnet
+for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+const auto & layer = model.layers[il].posnet;
+
+inpL = cur;
+
+switch (il) {
+case 0:
+case 1:
+case 3:
+case 4:
+{
+cur = llm_build_norm(ctx0, cur, hparams,
+layer.norm1,
+layer.norm1_b,
+LLM_NORM_GROUP, cb, 0);
+
+cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
+
+cur = lm_ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
+cur = lm_ggml_add(ctx0, cur, layer.conv1_b);
+
+cur = llm_build_norm(ctx0, cur, hparams,
+layer.norm2,
+layer.norm2_b,
+LLM_NORM_GROUP, cb, 0);
+
+cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
+
+cur = lm_ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
+cur = lm_ggml_add(ctx0, cur, layer.conv2_b);
+
+cur = lm_ggml_add(ctx0, cur, inpL);
+} break;
+case 2:
+{
+cur = llm_build_norm(ctx0, cur, hparams,
+layer.attn_norm,
+layer.attn_norm_b,
+LLM_NORM_GROUP, cb, 0);
+
+struct lm_ggml_tensor * q;
+struct lm_ggml_tensor * k;
+struct lm_ggml_tensor * v;
+
+q = lm_ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
+k = lm_ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
+v = lm_ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
+
+q = lm_ggml_add(ctx0, q, layer.attn_q_b);
+k = lm_ggml_add(ctx0, k, layer.attn_k_b);
+v = lm_ggml_add(ctx0, v, layer.attn_v_b);
+
+q = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, q));
+k = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, k));
+
+struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+
+kq = lm_ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
+
+cur = lm_ggml_mul_mat(ctx0, kq, v);
+
+cur = lm_ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
+cur = lm_ggml_add(ctx0, cur, layer.attn_o_b);
+
+cur = lm_ggml_add(ctx0, cur, inpL);
+} break;
+case 5:
+{
+cur = llm_build_norm(ctx0, cur, hparams,
+layer.norm,
+layer.norm_b,
+LLM_NORM_GROUP, cb, 0);
+} break;
+default: LM_GGML_ABORT("unknown posnet layer");
+};
+}
+
+cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.tok_norm,
+model.tok_norm_b,
+LLM_NORM, cb, -1);
+
+cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+inpL = cur;
+
+// convnext
+for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+const auto & layer = model.layers[il].convnext;
+
+cur = inpL;
+
+cur = lm_ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
+cur = lm_ggml_add(ctx0, cur, layer.dw_b);
+
+cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+cur = llm_build_norm(ctx0, cur, hparams,
+layer.norm,
+layer.norm_b,
+LLM_NORM, cb, -1);
+
+cur = llm_build_ffn(ctx0, lctx, cur,
+layer.pw1, layer.pw1_b, NULL,
+NULL, NULL, NULL,
+layer.pw2, layer.pw2_b, NULL,
+NULL,
+LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+
+cur = lm_ggml_mul(ctx0, cur, layer.gamma);
+
+cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+inpL = lm_ggml_add(ctx0, cur, inpL);
+}
+
+cur = inpL;
+
+cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.output_norm,
+model.output_norm_b,
+LLM_NORM, cb, -1);
+
+// lm_head
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+cur = lm_ggml_add(ctx0, cur, model.output_b);
+cb(cur, "result_embd", -1);
+
+lm_ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
 };

 static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16692,11 +17692,16 @@ static struct lm_ggml_cgraph * llama_build_graph(

 switch (model.arch) {
 case LLM_ARCH_LLAMA:
+case LLM_ARCH_MINICPM:
 case LLM_ARCH_GRANITE:
 case LLM_ARCH_GRANITE_MOE:
 {
 result = llm.build_llama();
 } break;
+case LLM_ARCH_DECI:
+{
+result = llm.build_deci();
+} break;
 case LLM_ARCH_BAICHUAN:
 {
 result = llm.build_baichuan();
@@ -16743,6 +17748,11 @@ static struct lm_ggml_cgraph * llama_build_graph(
 {
 result = llm.build_qwen2();
 } break;
+case LLM_ARCH_QWEN2VL:
+{
+lctx.n_pos_per_token = 4;
+result = llm.build_qwen2vl();
+} break;
 case LLM_ARCH_QWEN2MOE:
 {
 result = llm.build_qwen2moe();
@@ -16775,10 +17785,6 @@ static struct lm_ggml_cgraph * llama_build_graph(
 {
 result = llm.build_internlm2();
 } break;
-case LLM_ARCH_MINICPM:
-{
-result = llm.build_minicpm();
-} break;
 case LLM_ARCH_MINICPM3:
 {
 result = llm.build_minicpm3();
@@ -16835,6 +17841,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
 {
 result = llm.build_arctic();
 } break;
+case LLM_ARCH_DEEPSEEK:
+{
+result = llm.build_deepseek();
+} break;
 case LLM_ARCH_DEEPSEEK2:
 {
 result = llm.build_deepseek2();
@@ -16850,14 +17860,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
 case LLM_ARCH_T5:
 {
 if (lctx.is_encoding) {
-result = llm.
+result = llm.build_t5_enc();
 } else {
-result = llm.
+result = llm.build_t5_dec();
 }
 } break;
 case LLM_ARCH_T5ENCODER:
 {
-result = llm.
+result = llm.build_t5_enc();
 } break;
 case LLM_ARCH_JAIS:
 {
@@ -16879,6 +17889,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
 {
 result = llm.build_chameleon();
 } break;
+case LLM_ARCH_WAVTOKENIZER_DEC:
+{
+result = llm.build_wavtokenizer_dec();
+} break;
 default:
 LM_GGML_ABORT("fatal error");
 }
@@ -16965,35 +17979,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)

 if (ubatch.pos && lctx.inp_pos) {
 const int64_t n_tokens = ubatch.n_tokens;
-
-lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*lm_ggml_element_size(lctx.inp_pos));
+auto n_pos = lctx.n_pos_per_token;
+lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*lm_ggml_element_size(lctx.inp_pos));
 }

 if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-
+//LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+
+if (!lctx.inp_out_ids) {
+LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+} else {
+const int64_t n_tokens = ubatch.n_tokens;

-
-
+LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+int32_t * data = (int32_t *) lctx.inp_out_ids->data;

-
-
-
-}
-} else if (ubatch.output) {
-int32_t n_outputs = 0;
-for (int i = 0; i < n_tokens; ++i) {
-if (ubatch.output[i]) {
-data[n_outputs++] = i;
+if (lctx.n_outputs == n_tokens) {
+for (int i = 0; i < n_tokens; ++i) {
+data[i] = i;
 }
+} else if (ubatch.output) {
+int32_t n_outputs = 0;
+for (int i = 0; i < n_tokens; ++i) {
+if (ubatch.output[i]) {
+data[n_outputs++] = i;
+}
+}
+// the graph needs to have been passed the correct number of outputs
+LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
+} else if (lctx.n_outputs == 1) {
+// only keep last output
+data[0] = n_tokens - 1;
+} else {
+LM_GGML_ASSERT(lctx.n_outputs == 0);
 }
-// the graph needs to have been passed the correct number of outputs
-LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
-} else if (lctx.n_outputs == 1) {
-// only keep last output
-data[0] = n_tokens - 1;
-} else {
-LM_GGML_ASSERT(lctx.n_outputs == 0);
 }
 }

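Illustrative note (not part of the package diff): the position copy above now scales with lctx.n_pos_per_token, which the llama_build_graph switch earlier in this diff sets to 4 for LLM_ARCH_QWEN2VL (M-RoPE) and which presumably stays at 1 for other architectures. As a worked example, an ubatch of n_tokens = 8 on a Qwen2-VL model would copy 8 * 4 = 32 position entries (128 bytes for an I32 inp_pos tensor) instead of 8.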
@@ -17664,6 +18683,7 @@ static int llama_decode_internal(
 embd = nullptr; // do not extract embeddings when not needed
 LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
 }
+
 // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (lm_ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

 lm_ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -18451,10 +19471,6 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
 new_type = LM_GGML_TYPE_IQ3_S;
 }
-else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 ||
-new_type == LM_GGML_TYPE_Q4_0_8_8) {
-new_type = LM_GGML_TYPE_Q4_0;
-}
 else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
 new_type = LM_GGML_TYPE_Q4_K;
 }
@@ -18777,9 +19793,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = LM_GGML_TYPE_IQ4_XS; break;
 case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = LM_GGML_TYPE_IQ3_S; break;
 case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = LM_GGML_TYPE_IQ3_S; break;
-case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = LM_GGML_TYPE_Q4_0_4_4; break;
-case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = LM_GGML_TYPE_Q4_0_4_8; break;
-case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = LM_GGML_TYPE_Q4_0_8_8; break;

 default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
 }
@@ -19118,14 +20131,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 f32_data = (float *) f32_conv_buf.data();
 }

-int chunk_size_multiplier = 1;
-if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 || new_type == LM_GGML_TYPE_Q4_0_8_8) {
-if ((new_type == LM_GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = LM_GGML_TYPE_Q4_0;
-else if (tensor->ne[1] % 4 != 0) new_type = LM_GGML_TYPE_Q4_0;
-if (new_type == LM_GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-}
-
 LLAMA_LOG_INFO("converting to %s .. ", lm_ggml_type_name(new_type));
 fflush(stdout);

@@ -19138,8 +20143,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 const int64_t nrows = tensor->ne[1];

 static const int64_t min_chunk_size = 32 * 512;
-const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row))
-chunk_size_multiplier;
+const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

 const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
 const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
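Illustrative note (not part of the package diff): with the Q4_0_4_4/Q4_0_4_8/Q4_0_8_8 repacking types removed, chunk_size_multiplier is gone and the chunk size depends only on n_per_row. As a worked example, with n_per_row = 4096 and min_chunk_size = 32 * 512 = 16384, the rounded-up factor is (16384 + 4095) / 4096 = 4, so chunk_size = 4096 * 4 = 16384 elements; any row width of at least 16384 is used as the chunk size unchanged.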
@@ -20068,10 +21072,12 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_T5ENCODER:
 case LLM_ARCH_JAIS:
 case LLM_ARCH_RWKV6:
+case LLM_ARCH_WAVTOKENIZER_DEC:
 return LLAMA_ROPE_TYPE_NONE;

 // use what we call a normal RoPE, operating on pairs of consecutive head values
 case LLM_ARCH_LLAMA:
+case LLM_ARCH_DECI:
 case LLM_ARCH_BAICHUAN:
 case LLM_ARCH_STARCODER:
 case LLM_ARCH_PLAMO:
@@ -20082,6 +21088,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_COMMAND_R:
 case LLM_ARCH_OLMO:
 case LLM_ARCH_ARCTIC:
+case LLM_ARCH_DEEPSEEK:
 case LLM_ARCH_DEEPSEEK2:
 case LLM_ARCH_CHATGLM:
 case LLM_ARCH_GRANITE:
@@ -20115,6 +21122,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_MINICPM3:
 return LLAMA_ROPE_TYPE_NEOX;

+case LLM_ARCH_QWEN2VL:
+return LLAMA_ROPE_TYPE_MROPE;
+
 // all model arches should be listed explicitly here
 case LLM_ARCH_UNKNOWN:
 LM_GGML_ABORT("unknown architecture");
@@ -20181,17 +21191,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
 return model->n_elements;
 }

-struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
-auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
-[name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
-return it.first == name;
-});
-if (it == model->tensors_by_name.end()) {
-return nullptr;
-}
-return it->second;
-}
-
 bool llama_model_has_encoder(const struct llama_model * model) {
 switch (model->arch) {
 case LLM_ARCH_T5: return true;
@@ -21683,7 +22682,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
 throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
 }
 } else if ((size_t) i >= ctx->output_ids.size()) {
-throw std::runtime_error(format("out of range [0, %
+throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
 } else {
 j = ctx->output_ids[i];
 }
@@ -21854,18 +22853,115 @@ int32_t llama_detokenize(
 // chat templates
 //

+static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
+if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
+return LLM_CHAT_TEMPLATES.at(tmpl);
+}
+auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
+return tmpl.find(haystack) != std::string::npos;
+};
+if (tmpl_contains("<|im_start|>")) {
+return LLM_CHAT_TEMPLATE_CHATML;
+} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
+if (tmpl_contains("[SYSTEM_PROMPT]")) {
+return LLM_CHAT_TEMPLATE_MISTRAL_V7;
+} else if (
+// catches official 'v1' template
+tmpl_contains("' [INST] ' + system_message")
+// catches official 'v3' and 'v3-tekken' templates
+|| tmpl_contains("[AVAILABLE_TOOLS]")
+) {
+// Official mistral 'v1', 'v3' and 'v3-tekken' templates
+// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+if (tmpl_contains(" [INST]")) {
+return LLM_CHAT_TEMPLATE_MISTRAL_V1;
+} else if (tmpl_contains("\"[INST]\"")) {
+return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
+}
+return LLM_CHAT_TEMPLATE_MISTRAL_V3;
+} else {
+// llama2 template and its variants
+// [variant] support system message
+// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+bool support_system_message = tmpl_contains("<<SYS>>");
+bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
+bool strip_message = tmpl_contains("content.strip()");
+if (strip_message) {
+return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
+} else if (add_bos_inside_history) {
+return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
+} else if (support_system_message) {
+return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
+} else {
+return LLM_CHAT_TEMPLATE_LLAMA_2;
+}
+}
+} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
+return LLM_CHAT_TEMPLATE_PHI_3;
+} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+return LLM_CHAT_TEMPLATE_FALCON_3;
+} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
+return LLM_CHAT_TEMPLATE_ZEPHYR;
+} else if (tmpl_contains("bos_token + message['role']")) {
+return LLM_CHAT_TEMPLATE_MONARCH;
+} else if (tmpl_contains("<start_of_turn>")) {
+return LLM_CHAT_TEMPLATE_GEMMA;
+} else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
+// OrionStarAI/Orion-14B-Chat
+return LLM_CHAT_TEMPLATE_ORION;
+} else if (tmpl_contains("GPT4 Correct ")) {
+// openchat/openchat-3.5-0106
+return LLM_CHAT_TEMPLATE_OPENCHAT;
+} else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
+// eachadea/vicuna-13b-1.1 (and Orca variant)
+if (tmpl_contains("SYSTEM: ")) {
+return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
+}
+return LLM_CHAT_TEMPLATE_VICUNA;
+} else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
+// deepseek-ai/deepseek-coder-33b-instruct
+return LLM_CHAT_TEMPLATE_DEEPSEEK;
+} else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
+// CohereForAI/c4ai-command-r-plus
+return LLM_CHAT_TEMPLATE_COMMAND_R;
+} else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
+return LLM_CHAT_TEMPLATE_LLAMA_3;
+} else if (tmpl_contains("[gMASK]sop")) {
+// chatglm3-6b
+return LLM_CHAT_TEMPLATE_CHATGML_3;
+} else if (tmpl_contains("[gMASK]<sop>")) {
+return LLM_CHAT_TEMPLATE_CHATGML_4;
+} else if (tmpl_contains(LU8("<用户>"))) {
+// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+return LLM_CHAT_TEMPLATE_MINICPM;
+} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
+} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
+// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+// EXAONE-3.0-7.8B-Instruct
+return LLM_CHAT_TEMPLATE_EXAONE_3;
+} else if (tmpl_contains("rwkv-world")) {
+return LLM_CHAT_TEMPLATE_RWKV_WORLD;
+} else if (tmpl_contains("<|start_of_role|>")) {
+return LLM_CHAT_TEMPLATE_GRANITE;
+} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
+return LLM_CHAT_TEMPLATE_GIGACHAT;
+} else if (tmpl_contains("<|role_start|>")) {
+return LLM_CHAT_TEMPLATE_MEGREZ;
+}
+return LLM_CHAT_TEMPLATE_UNKNOWN;
+}
+
 // Simple version of "llama_apply_chat_template" that only works with strings
 // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
 static int32_t llama_chat_apply_template_internal(
-const
+const llm_chat_template tmpl,
 const std::vector<const llama_chat_message *> & chat,
 std::string & dest, bool add_ass) {
 // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
 std::stringstream ss;
-
-return tmpl.find(haystack) != std::string::npos;
-};
-if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
+if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
 // chatml template
 for (auto message : chat) {
 ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -21873,16 +22969,59 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|im_start|>assistant\n";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+// Official mistral 'v7' template
+// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+for (auto message : chat) {
+std::string role(message->role);
+std::string content(message->content);
+if (role == "system") {
+ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+} else if (role == "user") {
+ss << "[INST] " << content << "[/INST]";
+}
+else {
+ss << " " << content << "</s>";
+}
+}
+} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
+|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
+|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
+// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
+std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
+bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
+bool is_inside_turn = false;
+for (auto message : chat) {
+if (!is_inside_turn) {
+ss << leading_space << "[INST]" << trailing_space;
+is_inside_turn = true;
+}
+std::string role(message->role);
+std::string content(message->content);
+if (role == "system") {
+ss << content << "\n\n";
+} else if (role == "user") {
+ss << content << leading_space << "[/INST]";
+} else {
+ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
+is_inside_turn = false;
+}
+}
+} else if (
+tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
+|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
+|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
+|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
 // llama2 template and its variants
 // [variant] support system message
-
-
-bool space_around_response = tmpl_contains("' ' + eos_token");
+// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
 // [variant] add BOS inside history
-bool add_bos_inside_history =
+bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
 // [variant] trim spaces from the input message
-bool strip_message =
+bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
 // construct the prompt
 bool is_inside_turn = true; // skip BOS at the beginning
 ss << "[INST] ";
@@ -21903,12 +23042,11 @@ static int32_t llama_chat_apply_template_internal(
 } else if (role == "user") {
 ss << content << " [/INST]";
 } else {
-ss <<
+ss << content << "</s>";
 is_inside_turn = false;
 }
 }
-
-} else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
+} else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
 // Phi 3
 for (auto message : chat) {
 std::string role(message->role);
@@ -21917,7 +23055,16 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|assistant|>\n";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+// Falcon 3
+for (auto message : chat) {
+std::string role(message->role);
+ss << "<|" << role << "|>\n" << message->content << "\n";
+}
+if (add_ass) {
+ss << "<|assistant|>\n";
+}
+} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
 // zephyr template
 for (auto message : chat) {
 ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -21925,7 +23072,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|assistant|>\n";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
 // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
 for (auto message : chat) {
 std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -21934,7 +23081,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<s>assistant\n";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
 // google/gemma-7b-it
 std::string system_prompt = "";
 for (auto message : chat) {
@@ -21956,7 +23103,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<start_of_turn>model\n";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
 // OrionStarAI/Orion-14B-Chat
 std::string system_prompt = "";
 for (auto message : chat) {
@@ -21976,7 +23123,7 @@ static int32_t llama_chat_apply_template_internal(
 ss << message->content << "</s>";
 }
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
 // openchat/openchat-3.5-0106,
 for (auto message : chat) {
 std::string role(message->role);
@@ -21990,13 +23137,13 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "GPT4 Correct Assistant:";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
 // eachadea/vicuna-13b-1.1 (and Orca variant)
 for (auto message : chat) {
 std::string role(message->role);
 if (role == "system") {
 // Orca-Vicuna variant uses a system prefix
-if (tmpl ==
+if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
 ss << "SYSTEM: " << message->content << "\n";
 } else {
 ss << message->content << "\n\n";
@@ -22010,7 +23157,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "ASSISTANT:";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
 // deepseek-ai/deepseek-coder-33b-instruct
 for (auto message : chat) {
 std::string role(message->role);
@@ -22025,7 +23172,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "### Response:\n";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
 // CohereForAI/c4ai-command-r-plus
 for (auto message : chat) {
 std::string role(message->role);
@@ -22040,7 +23187,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
 // Llama 3
 for (auto message : chat) {
 std::string role(message->role);
@@ -22049,7 +23196,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
 // chatglm3-6b
 ss << "[gMASK]" << "sop";
 for (auto message : chat) {
@@ -22059,7 +23206,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|assistant|>";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
 ss << "[gMASK]" << "<sop>";
 for (auto message : chat) {
 std::string role(message->role);
@@ -22068,7 +23215,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|assistant|>";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
 // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
 for (auto message : chat) {
 std::string role(message->role);
@@ -22080,7 +23227,7 @@ static int32_t llama_chat_apply_template_internal(
 ss << trim(message->content);
 }
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
 // DeepSeek-V2
 for (auto message : chat) {
 std::string role(message->role);
@@ -22095,7 +23242,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "Assistant:";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
 // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
 // EXAONE-3.0-7.8B-Instruct
 for (auto message : chat) {
@@ -22111,7 +23258,7 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "[|assistant|]";
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
 // this template requires the model to have "\n\n" as EOT token
 for (auto message : chat) {
 std::string role(message->role);
@@ -22121,7 +23268,7 @@ static int32_t llama_chat_apply_template_internal(
 ss << message->content << "\n\n";
 }
 }
-} else if (tmpl ==
+} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
 // IBM Granite template
 for (const auto & message : chat) {
 std::string role(message->role);
@@ -22134,6 +23281,42 @@ static int32_t llama_chat_apply_template_internal(
 if (add_ass) {
 ss << "<|start_of_role|>assistant<|end_of_role|>\n";
 }
+} else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+// GigaChat template
+bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+// Handle system message if present
+if (has_system) {
+ss << "<s>" << chat[0]->content << "<|message_sep|>";
+} else {
+ss << "<s>";
+}
+
+// Process remaining messages
+for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+std::string role(chat[i]->role);
+if (role == "user") {
+ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+<< "available functions<|role_sep|>[]<|message_sep|>";
+} else if (role == "assistant") {
+ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+}
+}
+
+// Add generation prompt if needed
+if (add_ass) {
+ss << "assistant<|role_sep|>";
+}
+} else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
+// Megrez template
+for (auto message : chat) {
+std::string role(message->role);
+ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
+}
+
+if (add_ass) {
+ss << "<|role_start|>assistant<|role_end|>";
+}
 } else {
 // template not supported
 return -1;
@@ -22153,15 +23336,15 @@ int32_t llama_chat_apply_template(
 std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
 if (tmpl == nullptr) {
 LM_GGML_ASSERT(model != nullptr);
-
-
-
-
-
+
+// load template from model, if available
+const auto & it = model->lm_gguf_kv.find("tokenizer.chat_template");
+if (it != model->lm_gguf_kv.end() && it->second.size() > 0) {
+curr_tmpl = it->second;
+}
+else {
 // worst case: there is no information about template, we will use chatml by default
-curr_tmpl = "chatml";
-} else {
-curr_tmpl = std::string(model_template.data(), model_template.size());
+curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
 }
 }

@@ -22173,7 +23356,11 @@ int32_t llama_chat_apply_template(
 }

 std::string formatted_chat;
-
+llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
+if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
+return -1;
+}
+int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
 if (res < 0) {
 return res;
 }
@@ -22183,6 +23370,15 @@ int32_t llama_chat_apply_template(
 return res;
 }

+int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
+auto it = LLM_CHAT_TEMPLATES.begin();
+for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
+output[i] = it->first.c_str();
+std::advance(it, 1);
+}
+return (int32_t) LLM_CHAT_TEMPLATES.size();
+}
+
 //
 // sampling
 //
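Illustrative note (not part of the package diff): a minimal sketch of how the new llama_chat_builtin_templates entry point might be called from application code, assuming the matching declaration that this release adds to llama.h. Because the copy loop is bounded by std::min(len, LLM_CHAT_TEMPLATES.size()), a first call with len == 0 (and a null output array) only returns the number of built-in templates.

// Hypothetical helper, not part of cui-llama.rn: list the chat templates
// compiled into this build of llama.cpp.
#include <cstdio>
#include <vector>
#include "llama.h"

static void print_builtin_chat_templates() {
    const int32_t n = llama_chat_builtin_templates(nullptr, 0); // query the count only
    if (n <= 0) {
        return;
    }
    std::vector<const char *> names(n);
    llama_chat_builtin_templates(names.data(), names.size());
    for (const char * name : names) {
        std::printf("%s\n", name); // e.g. "chatml"
    }
}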