llama_cpp 0.15.2 → 0.15.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
@@ -26,16 +26,9 @@
|
|
26
26
|
#ifdef GGML_USE_METAL
|
27
27
|
# include "ggml-metal.h"
|
28
28
|
#endif
|
29
|
-
|
30
|
-
|
31
|
-
#
|
32
|
-
#ifndef QK_K
|
33
|
-
# ifdef GGML_QKK_64
|
34
|
-
# define QK_K 64
|
35
|
-
# else
|
36
|
-
# define QK_K 256
|
37
|
-
# endif
|
38
|
-
#endif
|
29
|
+
|
30
|
+
// TODO: replace with ggml API call
|
31
|
+
#define QK_K 256
|
39
32
|
|
40
33
|
#ifdef __has_include
|
41
34
|
#if __has_include(<unistd.h>)
|
@@ -110,7 +103,7 @@
|
|
110
103
|
#endif
|
111
104
|
|
112
105
|
#define LLAMA_MAX_NODES 8192
|
113
|
-
#define LLAMA_MAX_EXPERTS
|
106
|
+
#define LLAMA_MAX_EXPERTS 160
|
114
107
|
|
115
108
|
//
|
116
109
|
// logging
|
@@ -205,7 +198,6 @@ enum llm_arch {
|
|
205
198
|
LLM_ARCH_GPTNEOX,
|
206
199
|
LLM_ARCH_MPT,
|
207
200
|
LLM_ARCH_STARCODER,
|
208
|
-
LLM_ARCH_PERSIMMON,
|
209
201
|
LLM_ARCH_REFACT,
|
210
202
|
LLM_ARCH_BERT,
|
211
203
|
LLM_ARCH_NOMIC_BERT,
|
@@ -229,6 +221,8 @@ enum llm_arch {
|
|
229
221
|
LLM_ARCH_COMMAND_R,
|
230
222
|
LLM_ARCH_DBRX,
|
231
223
|
LLM_ARCH_OLMO,
|
224
|
+
LLM_ARCH_ARCTIC,
|
225
|
+
LLM_ARCH_DEEPSEEK2,
|
232
226
|
LLM_ARCH_UNKNOWN,
|
233
227
|
};
|
234
228
|
|
@@ -242,7 +236,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
242
236
|
{ LLM_ARCH_MPT, "mpt" },
|
243
237
|
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
244
238
|
{ LLM_ARCH_STARCODER, "starcoder" },
|
245
|
-
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
246
239
|
{ LLM_ARCH_REFACT, "refact" },
|
247
240
|
{ LLM_ARCH_BERT, "bert" },
|
248
241
|
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
@@ -266,6 +259,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
266
259
|
{ LLM_ARCH_COMMAND_R, "command-r" },
|
267
260
|
{ LLM_ARCH_DBRX, "dbrx" },
|
268
261
|
{ LLM_ARCH_OLMO, "olmo" },
|
262
|
+
{ LLM_ARCH_ARCTIC, "arctic" },
|
263
|
+
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
269
264
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
270
265
|
};
|
271
266
|
|
@@ -286,11 +281,15 @@ enum llm_kv {
|
|
286
281
|
LLM_KV_CONTEXT_LENGTH,
|
287
282
|
LLM_KV_EMBEDDING_LENGTH,
|
288
283
|
LLM_KV_BLOCK_COUNT,
|
284
|
+
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
289
285
|
LLM_KV_FEED_FORWARD_LENGTH,
|
286
|
+
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
|
290
287
|
LLM_KV_USE_PARALLEL_RESIDUAL,
|
291
288
|
LLM_KV_TENSOR_DATA_LAYOUT,
|
292
289
|
LLM_KV_EXPERT_COUNT,
|
293
290
|
LLM_KV_EXPERT_USED_COUNT,
|
291
|
+
LLM_KV_EXPERT_SHARED_COUNT,
|
292
|
+
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
294
293
|
LLM_KV_POOLING_TYPE,
|
295
294
|
LLM_KV_LOGIT_SCALE,
|
296
295
|
|
@@ -303,14 +302,18 @@ enum llm_kv {
|
|
303
302
|
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
304
303
|
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
305
304
|
LLM_KV_ATTENTION_CAUSAL,
|
305
|
+
LLM_KV_ATTENTION_Q_LORA_RANK,
|
306
|
+
LLM_KV_ATTENTION_KV_LORA_RANK,
|
306
307
|
|
307
308
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
308
309
|
LLM_KV_ROPE_FREQ_BASE,
|
309
310
|
LLM_KV_ROPE_SCALE_LINEAR,
|
310
311
|
LLM_KV_ROPE_SCALING_TYPE,
|
311
312
|
LLM_KV_ROPE_SCALING_FACTOR,
|
313
|
+
LLM_KV_ROPE_SCALING_ATTN_FACTOR,
|
312
314
|
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
313
315
|
LLM_KV_ROPE_SCALING_FINETUNED,
|
316
|
+
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
|
314
317
|
|
315
318
|
LLM_KV_SPLIT_NO,
|
316
319
|
LLM_KV_SPLIT_COUNT,
|
@@ -359,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
359
362
|
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
360
363
|
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
361
364
|
|
362
|
-
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size"
|
363
|
-
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length"
|
364
|
-
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length"
|
365
|
-
{ LLM_KV_BLOCK_COUNT, "%s.block_count"
|
366
|
-
{
|
367
|
-
{
|
368
|
-
{
|
369
|
-
{
|
370
|
-
{
|
371
|
-
{
|
372
|
-
{
|
365
|
+
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
366
|
+
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
367
|
+
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
368
|
+
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
369
|
+
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
370
|
+
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
|
371
|
+
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
|
372
|
+
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
|
373
|
+
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
374
|
+
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
375
|
+
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
376
|
+
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
|
377
|
+
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
378
|
+
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
379
|
+
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
373
380
|
|
374
381
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
375
382
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
@@ -380,14 +387,18 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
380
387
|
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
381
388
|
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
382
389
|
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
|
390
|
+
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
|
391
|
+
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
|
383
392
|
|
384
393
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
385
394
|
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
386
395
|
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
387
396
|
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
|
388
397
|
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
|
398
|
+
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
|
389
399
|
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
|
390
400
|
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
|
401
|
+
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
|
391
402
|
|
392
403
|
{ LLM_KV_SPLIT_NO, "split.no" },
|
393
404
|
{ LLM_KV_SPLIT_COUNT, "split.count" },
|
@@ -441,6 +452,8 @@ enum llm_tensor {
|
|
441
452
|
LLM_TENSOR_OUTPUT,
|
442
453
|
LLM_TENSOR_OUTPUT_NORM,
|
443
454
|
LLM_TENSOR_ROPE_FREQS,
|
455
|
+
LLM_TENSOR_ROPE_FACTORS_LONG,
|
456
|
+
LLM_TENSOR_ROPE_FACTORS_SHORT,
|
444
457
|
LLM_TENSOR_ATTN_Q,
|
445
458
|
LLM_TENSOR_ATTN_K,
|
446
459
|
LLM_TENSOR_ATTN_V,
|
@@ -460,6 +473,7 @@ enum llm_tensor {
|
|
460
473
|
LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
|
461
474
|
LLM_TENSOR_FFN_GATE_EXP,
|
462
475
|
LLM_TENSOR_FFN_UP_EXP,
|
476
|
+
LLM_TENSOR_FFN_NORM_EXPS,
|
463
477
|
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
464
478
|
LLM_TENSOR_FFN_GATE_EXPS,
|
465
479
|
LLM_TENSOR_FFN_UP_EXPS,
|
@@ -476,6 +490,12 @@ enum llm_tensor {
|
|
476
490
|
LLM_TENSOR_SSM_A,
|
477
491
|
LLM_TENSOR_SSM_D,
|
478
492
|
LLM_TENSOR_SSM_OUT,
|
493
|
+
LLM_TENSOR_ATTN_Q_A,
|
494
|
+
LLM_TENSOR_ATTN_Q_B,
|
495
|
+
LLM_TENSOR_ATTN_KV_A_MQA,
|
496
|
+
LLM_TENSOR_ATTN_KV_B,
|
497
|
+
LLM_TENSOR_ATTN_Q_A_NORM,
|
498
|
+
LLM_TENSOR_ATTN_KV_A_NORM,
|
479
499
|
};
|
480
500
|
|
481
501
|
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
@@ -598,23 +618,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
598
618
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
599
619
|
},
|
600
620
|
},
|
601
|
-
{
|
602
|
-
LLM_ARCH_PERSIMMON,
|
603
|
-
{
|
604
|
-
{ LLM_TENSOR_TOKEN_EMBD, "token_embd"},
|
605
|
-
{ LLM_TENSOR_OUTPUT_NORM, "output_norm"},
|
606
|
-
{ LLM_TENSOR_OUTPUT, "output"},
|
607
|
-
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
|
608
|
-
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
|
609
|
-
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
|
610
|
-
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
|
611
|
-
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
|
612
|
-
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
|
613
|
-
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
|
614
|
-
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
|
615
|
-
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
|
616
|
-
},
|
617
|
-
},
|
618
621
|
{
|
619
622
|
LLM_ARCH_MPT,
|
620
623
|
{
|
@@ -825,18 +828,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
825
828
|
{
|
826
829
|
LLM_ARCH_PHI3,
|
827
830
|
{
|
828
|
-
{ LLM_TENSOR_TOKEN_EMBD,
|
829
|
-
{ LLM_TENSOR_OUTPUT_NORM,
|
830
|
-
{ LLM_TENSOR_OUTPUT,
|
831
|
-
{
|
832
|
-
{
|
833
|
-
{
|
834
|
-
{
|
835
|
-
{
|
836
|
-
{
|
837
|
-
{
|
838
|
-
{
|
839
|
-
{
|
831
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
832
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
833
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
834
|
+
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
|
835
|
+
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
|
836
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
837
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
838
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
839
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
840
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
841
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
842
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
843
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
844
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
840
845
|
},
|
841
846
|
},
|
842
847
|
{
|
@@ -1052,6 +1057,57 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
1052
1057
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
1053
1058
|
},
|
1054
1059
|
},
|
1060
|
+
{
|
1061
|
+
LLM_ARCH_ARCTIC,
|
1062
|
+
{
|
1063
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
1064
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
1065
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1066
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
1067
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
1068
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
1069
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
1070
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1071
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
1072
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
1073
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
1074
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
1075
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
1076
|
+
{ LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
|
1077
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
1078
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
1079
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
1080
|
+
},
|
1081
|
+
},
|
1082
|
+
{
|
1083
|
+
LLM_ARCH_DEEPSEEK2,
|
1084
|
+
{
|
1085
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
1086
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
1087
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1088
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
1089
|
+
{ LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
|
1090
|
+
{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
|
1091
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
1092
|
+
{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
|
1093
|
+
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
|
1094
|
+
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
|
1095
|
+
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
|
1096
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1097
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
1098
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
1099
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
1100
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
1101
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
1102
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
1103
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
1104
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
1105
|
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
1106
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
1107
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
1108
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
1109
|
+
},
|
1110
|
+
},
|
1055
1111
|
{
|
1056
1112
|
LLM_ARCH_UNKNOWN,
|
1057
1113
|
{
|
@@ -1646,12 +1702,13 @@ struct llama_mlock {
|
|
1646
1702
|
};
|
1647
1703
|
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
1648
1704
|
|
1649
|
-
|
1705
|
+
// NOTE: avoid ever using this except for building the token_to_piece caches
|
1706
|
+
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
|
1650
1707
|
std::vector<char> result(8, 0);
|
1651
|
-
const int n_tokens = llama_token_to_piece(
|
1708
|
+
const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
1652
1709
|
if (n_tokens < 0) {
|
1653
1710
|
result.resize(-n_tokens);
|
1654
|
-
int check = llama_token_to_piece(
|
1711
|
+
int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
1655
1712
|
GGML_ASSERT(check == -n_tokens);
|
1656
1713
|
}
|
1657
1714
|
else {
|
@@ -1697,6 +1754,8 @@ struct llama_state {
|
|
1697
1754
|
llama_state() {
|
1698
1755
|
#ifdef GGML_USE_METAL
|
1699
1756
|
ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
|
1757
|
+
#elif defined(GGML_USE_CUDA)
|
1758
|
+
ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
|
1700
1759
|
#endif
|
1701
1760
|
}
|
1702
1761
|
|
@@ -1710,23 +1769,31 @@ static llama_state g_state;
|
|
1710
1769
|
// available llama models
|
1711
1770
|
enum e_model {
|
1712
1771
|
MODEL_UNKNOWN,
|
1772
|
+
MODEL_14M,
|
1713
1773
|
MODEL_17M,
|
1714
1774
|
MODEL_22M,
|
1715
1775
|
MODEL_33M,
|
1776
|
+
MODEL_70M,
|
1716
1777
|
MODEL_109M,
|
1717
1778
|
MODEL_137M,
|
1779
|
+
MODEL_160M,
|
1718
1780
|
MODEL_335M,
|
1781
|
+
MODEL_410M,
|
1719
1782
|
MODEL_0_5B,
|
1720
1783
|
MODEL_1B,
|
1784
|
+
MODEL_1_4B,
|
1721
1785
|
MODEL_2B,
|
1786
|
+
MODEL_2_8B,
|
1722
1787
|
MODEL_3B,
|
1723
1788
|
MODEL_4B,
|
1789
|
+
MODEL_6_9B,
|
1724
1790
|
MODEL_7B,
|
1725
1791
|
MODEL_8B,
|
1726
1792
|
MODEL_12B,
|
1727
1793
|
MODEL_13B,
|
1728
1794
|
MODEL_14B,
|
1729
1795
|
MODEL_15B,
|
1796
|
+
MODEL_16B,
|
1730
1797
|
MODEL_20B,
|
1731
1798
|
MODEL_30B,
|
1732
1799
|
MODEL_34B,
|
@@ -1734,6 +1801,7 @@ enum e_model {
|
|
1734
1801
|
MODEL_40B,
|
1735
1802
|
MODEL_65B,
|
1736
1803
|
MODEL_70B,
|
1804
|
+
MODEL_236B,
|
1737
1805
|
MODEL_314B,
|
1738
1806
|
MODEL_SMALL,
|
1739
1807
|
MODEL_MEDIUM,
|
@@ -1743,6 +1811,7 @@ enum e_model {
|
|
1743
1811
|
MODEL_8x7B,
|
1744
1812
|
MODEL_8x22B,
|
1745
1813
|
MODEL_16x12B,
|
1814
|
+
MODEL_10B_128x3_66B,
|
1746
1815
|
};
|
1747
1816
|
|
1748
1817
|
static const size_t kiB = 1024;
|
@@ -1752,6 +1821,7 @@ static const size_t GiB = 1024*MiB;
|
|
1752
1821
|
struct llama_hparams {
|
1753
1822
|
bool vocab_only;
|
1754
1823
|
bool rope_finetuned;
|
1824
|
+
bool use_par_res;
|
1755
1825
|
|
1756
1826
|
uint32_t n_vocab;
|
1757
1827
|
uint32_t n_ctx_train; // context size the model was trained on
|
@@ -1767,12 +1837,21 @@ struct llama_hparams {
|
|
1767
1837
|
uint32_t n_expert_used = 0;
|
1768
1838
|
uint32_t n_vocab_type = 0; // for BERT-style token types
|
1769
1839
|
|
1840
|
+
uint32_t n_layer_dense_lead = 0;
|
1841
|
+
uint32_t n_lora_q = 0;
|
1842
|
+
uint32_t n_lora_kv = 0;
|
1843
|
+
uint32_t n_ff_exp = 0;
|
1844
|
+
uint32_t n_expert_shared = 0;
|
1845
|
+
float expert_weights_scale = 0.0;
|
1846
|
+
|
1770
1847
|
float f_norm_eps;
|
1771
1848
|
float f_norm_rms_eps;
|
1772
1849
|
|
1850
|
+
float rope_attn_factor = 1.0f;
|
1773
1851
|
float rope_freq_base_train;
|
1774
1852
|
float rope_freq_scale_train;
|
1775
1853
|
uint32_t n_yarn_orig_ctx;
|
1854
|
+
float rope_yarn_log_mul;
|
1776
1855
|
|
1777
1856
|
// for State Space Models
|
1778
1857
|
uint32_t ssm_d_conv = 0;
|
@@ -1806,6 +1885,12 @@ struct llama_hparams {
|
|
1806
1885
|
if (this->n_expert != other.n_expert) return true;
|
1807
1886
|
if (this->n_expert_used != other.n_expert_used) return true;
|
1808
1887
|
|
1888
|
+
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
|
1889
|
+
if (this->n_lora_q != other.n_lora_q) return true;
|
1890
|
+
if (this->n_lora_kv != other.n_lora_kv) return true;
|
1891
|
+
if (this->n_ff_exp != other.n_ff_exp) return true;
|
1892
|
+
if (this->n_expert_shared != other.n_expert_shared) return true;
|
1893
|
+
|
1809
1894
|
if (this->rope_finetuned != other.rope_finetuned) return true;
|
1810
1895
|
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
1811
1896
|
|
@@ -1818,8 +1903,11 @@ struct llama_hparams {
|
|
1818
1903
|
|
1819
1904
|
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
|
1820
1905
|
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
|
1906
|
+
if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
|
1821
1907
|
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
|
1822
1908
|
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
|
1909
|
+
if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
|
1910
|
+
if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
|
1823
1911
|
|
1824
1912
|
return false;
|
1825
1913
|
}
|
@@ -1895,6 +1983,8 @@ struct llama_layer {
|
|
1895
1983
|
struct ggml_tensor * attn_k_norm_b;
|
1896
1984
|
struct ggml_tensor * attn_out_norm;
|
1897
1985
|
struct ggml_tensor * attn_out_norm_b;
|
1986
|
+
struct ggml_tensor * attn_q_a_norm;
|
1987
|
+
struct ggml_tensor * attn_kv_a_norm;
|
1898
1988
|
|
1899
1989
|
// attention
|
1900
1990
|
struct ggml_tensor * wq;
|
@@ -1902,6 +1992,10 @@ struct llama_layer {
|
|
1902
1992
|
struct ggml_tensor * wv;
|
1903
1993
|
struct ggml_tensor * wo;
|
1904
1994
|
struct ggml_tensor * wqkv;
|
1995
|
+
struct ggml_tensor * wq_a;
|
1996
|
+
struct ggml_tensor * wq_b;
|
1997
|
+
struct ggml_tensor * wkv_a_mqa;
|
1998
|
+
struct ggml_tensor * wkv_b;
|
1905
1999
|
|
1906
2000
|
// attention bias
|
1907
2001
|
struct ggml_tensor * bq;
|
@@ -1915,6 +2009,7 @@ struct llama_layer {
|
|
1915
2009
|
struct ggml_tensor * ffn_norm_b;
|
1916
2010
|
struct ggml_tensor * layer_out_norm;
|
1917
2011
|
struct ggml_tensor * layer_out_norm_b;
|
2012
|
+
struct ggml_tensor * ffn_norm_exps;
|
1918
2013
|
|
1919
2014
|
// ff
|
1920
2015
|
struct ggml_tensor * ffn_gate; // w1
|
@@ -1934,8 +2029,9 @@ struct llama_layer {
|
|
1934
2029
|
struct ggml_tensor * ffn_up_shexp;
|
1935
2030
|
|
1936
2031
|
// ff bias
|
1937
|
-
struct ggml_tensor *
|
1938
|
-
struct ggml_tensor *
|
2032
|
+
struct ggml_tensor * ffn_gate_b = nullptr;
|
2033
|
+
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
2034
|
+
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
1939
2035
|
struct ggml_tensor * ffn_act;
|
1940
2036
|
|
1941
2037
|
// mamba proj
|
@@ -1952,6 +2048,10 @@ struct llama_layer {
|
|
1952
2048
|
// mamba bias
|
1953
2049
|
struct ggml_tensor * ssm_conv1d_b;
|
1954
2050
|
struct ggml_tensor * ssm_dt_b;
|
2051
|
+
|
2052
|
+
// long rope factors
|
2053
|
+
struct ggml_tensor * rope_long = nullptr;
|
2054
|
+
struct ggml_tensor * rope_short = nullptr;
|
1955
2055
|
};
|
1956
2056
|
|
1957
2057
|
struct llama_kv_cell {
|
@@ -2063,7 +2163,9 @@ struct llama_vocab {
|
|
2063
2163
|
std::unordered_map<token, id> token_to_id;
|
2064
2164
|
std::vector<token_data> id_to_token;
|
2065
2165
|
|
2066
|
-
std::
|
2166
|
+
std::vector<id> cache_special_tokens;
|
2167
|
+
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
|
2168
|
+
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
|
2067
2169
|
|
2068
2170
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
2069
2171
|
|
@@ -2268,10 +2370,6 @@ struct llama_context {
|
|
2268
2370
|
|
2269
2371
|
// control vectors
|
2270
2372
|
struct llama_control_vector cvec;
|
2271
|
-
|
2272
|
-
#ifdef GGML_USE_MPI
|
2273
|
-
ggml_mpi_context * ctx_mpi = NULL;
|
2274
|
-
#endif
|
2275
2373
|
};
|
2276
2374
|
|
2277
2375
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
@@ -2491,7 +2589,6 @@ static bool llama_kv_cache_init(
|
|
2491
2589
|
static bool llama_kv_cache_find_slot(
|
2492
2590
|
struct llama_kv_cache & cache,
|
2493
2591
|
const struct llama_batch & batch) {
|
2494
|
-
const uint32_t n_ctx = cache.size;
|
2495
2592
|
const uint32_t n_tokens = batch.n_tokens;
|
2496
2593
|
|
2497
2594
|
if (cache.recurrent) {
|
@@ -2542,16 +2639,16 @@ static bool llama_kv_cache_find_slot(
|
|
2542
2639
|
}
|
2543
2640
|
// otherwise, one cell per token.
|
2544
2641
|
|
2545
|
-
if (n_tokens >
|
2546
|
-
LLAMA_LOG_ERROR("%s: n_tokens=%d >
|
2642
|
+
if (n_tokens > cache.size) {
|
2643
|
+
LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
|
2547
2644
|
return false;
|
2548
2645
|
}
|
2549
2646
|
|
2550
2647
|
uint32_t n_tested = 0;
|
2551
2648
|
|
2552
2649
|
while (true) {
|
2553
|
-
if (cache.head + n_tokens >
|
2554
|
-
n_tested +=
|
2650
|
+
if (cache.head + n_tokens > cache.size) {
|
2651
|
+
n_tested += cache.size - cache.head;
|
2555
2652
|
cache.head = 0;
|
2556
2653
|
continue;
|
2557
2654
|
}
|
@@ -2570,7 +2667,7 @@ static bool llama_kv_cache_find_slot(
|
|
2570
2667
|
break;
|
2571
2668
|
}
|
2572
2669
|
|
2573
|
-
if (n_tested >=
|
2670
|
+
if (n_tested >= cache.size) {
|
2574
2671
|
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
2575
2672
|
return false;
|
2576
2673
|
}
|
@@ -3330,6 +3427,39 @@ struct llama_model_loader {
|
|
3330
3427
|
return get_arr_n(llm_kv(kid), result, required);
|
3331
3428
|
}
|
3332
3429
|
|
3430
|
+
template<typename T>
|
3431
|
+
bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
|
3432
|
+
const int kid = gguf_find_key(meta, key.c_str());
|
3433
|
+
|
3434
|
+
if (kid < 0) {
|
3435
|
+
if (required) {
|
3436
|
+
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
3437
|
+
}
|
3438
|
+
return false;
|
3439
|
+
}
|
3440
|
+
|
3441
|
+
struct GGUFMeta::ArrayInfo arr_info =
|
3442
|
+
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
|
3443
|
+
|
3444
|
+
if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
|
3445
|
+
throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
|
3446
|
+
}
|
3447
|
+
|
3448
|
+
// GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
|
3449
|
+
GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
|
3450
|
+
GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
|
3451
|
+
|
3452
|
+
result.resize(arr_info.length);
|
3453
|
+
result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
|
3454
|
+
|
3455
|
+
return true;
|
3456
|
+
}
|
3457
|
+
|
3458
|
+
template<typename T>
|
3459
|
+
bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
|
3460
|
+
return get_arr(llm_kv(kid), result, required);
|
3461
|
+
}
|
3462
|
+
|
3333
3463
|
template<typename T>
|
3334
3464
|
bool get_key(const std::string & key, T & result, const bool required = true) {
|
3335
3465
|
auto it = kv_overrides.find(key);
|
@@ -3404,11 +3534,15 @@ struct llama_model_loader {
|
|
3404
3534
|
return get_tensor_meta(get_tensor_name(i));
|
3405
3535
|
}
|
3406
3536
|
|
3407
|
-
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
|
3537
|
+
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
|
3408
3538
|
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
3409
3539
|
ggml_set_name(tensor, ggml_get_name(cur));
|
3410
3540
|
|
3411
|
-
|
3541
|
+
if (duplicated) {
|
3542
|
+
size_data += ggml_nbytes(cur);
|
3543
|
+
} else {
|
3544
|
+
n_created++;
|
3545
|
+
}
|
3412
3546
|
|
3413
3547
|
return tensor;
|
3414
3548
|
}
|
@@ -3443,14 +3577,17 @@ struct llama_model_loader {
|
|
3443
3577
|
return cur;
|
3444
3578
|
}
|
3445
3579
|
|
3446
|
-
|
3447
|
-
|
3580
|
+
static const int TENSOR_NOT_REQUIRED = 1;
|
3581
|
+
static const int TENSOR_DUPLICATED = 2;
|
3582
|
+
|
3583
|
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
|
3584
|
+
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
|
3448
3585
|
|
3449
3586
|
if (cur == NULL) {
|
3450
3587
|
return NULL;
|
3451
3588
|
}
|
3452
3589
|
|
3453
|
-
return create_tensor_for(ctx, cur);
|
3590
|
+
return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
|
3454
3591
|
}
|
3455
3592
|
|
3456
3593
|
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
|
@@ -3750,37 +3887,50 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
3750
3887
|
|
3751
3888
|
static const char * llama_model_type_name(e_model type) {
|
3752
3889
|
switch (type) {
|
3753
|
-
case
|
3754
|
-
case
|
3755
|
-
case
|
3756
|
-
case
|
3757
|
-
case
|
3758
|
-
case
|
3759
|
-
case
|
3760
|
-
case
|
3761
|
-
case
|
3762
|
-
case
|
3763
|
-
case
|
3764
|
-
case
|
3765
|
-
case
|
3766
|
-
case
|
3767
|
-
case
|
3768
|
-
case
|
3769
|
-
case
|
3770
|
-
case
|
3771
|
-
case
|
3772
|
-
case
|
3773
|
-
case
|
3774
|
-
case
|
3775
|
-
case
|
3776
|
-
case
|
3777
|
-
case
|
3778
|
-
case
|
3779
|
-
case
|
3780
|
-
case
|
3781
|
-
case
|
3782
|
-
case
|
3783
|
-
|
3890
|
+
case MODEL_14M: return "14M";
|
3891
|
+
case MODEL_17M: return "17M";
|
3892
|
+
case MODEL_22M: return "22M";
|
3893
|
+
case MODEL_33M: return "33M";
|
3894
|
+
case MODEL_70M: return "70M";
|
3895
|
+
case MODEL_109M: return "109M";
|
3896
|
+
case MODEL_137M: return "137M";
|
3897
|
+
case MODEL_160M: return "160M";
|
3898
|
+
case MODEL_335M: return "335M";
|
3899
|
+
case MODEL_410M: return "410M";
|
3900
|
+
case MODEL_0_5B: return "0.5B";
|
3901
|
+
case MODEL_1B: return "1B";
|
3902
|
+
case MODEL_1_4B: return "1.4B";
|
3903
|
+
case MODEL_2B: return "2B";
|
3904
|
+
case MODEL_2_8B: return "2.8B";
|
3905
|
+
case MODEL_3B: return "3B";
|
3906
|
+
case MODEL_4B: return "4B";
|
3907
|
+
case MODEL_6_9B: return "6.9B";
|
3908
|
+
case MODEL_7B: return "7B";
|
3909
|
+
case MODEL_8B: return "8B";
|
3910
|
+
case MODEL_12B: return "12B";
|
3911
|
+
case MODEL_13B: return "13B";
|
3912
|
+
case MODEL_14B: return "14B";
|
3913
|
+
case MODEL_15B: return "15B";
|
3914
|
+
case MODEL_16B: return "16B";
|
3915
|
+
case MODEL_20B: return "20B";
|
3916
|
+
case MODEL_30B: return "30B";
|
3917
|
+
case MODEL_34B: return "34B";
|
3918
|
+
case MODEL_35B: return "35B";
|
3919
|
+
case MODEL_40B: return "40B";
|
3920
|
+
case MODEL_65B: return "65B";
|
3921
|
+
case MODEL_70B: return "70B";
|
3922
|
+
case MODEL_236B: return "236B";
|
3923
|
+
case MODEL_314B: return "314B";
|
3924
|
+
case MODEL_SMALL: return "0.1B";
|
3925
|
+
case MODEL_MEDIUM: return "0.4B";
|
3926
|
+
case MODEL_LARGE: return "0.8B";
|
3927
|
+
case MODEL_XL: return "1.5B";
|
3928
|
+
case MODEL_A2_7B: return "A2.7B";
|
3929
|
+
case MODEL_8x7B: return "8x7B";
|
3930
|
+
case MODEL_8x22B: return "8x22B";
|
3931
|
+
case MODEL_16x12B: return "16x12B";
|
3932
|
+
case MODEL_10B_128x3_66B: return "10B+128x3.66B";
|
3933
|
+
default: return "?B";
|
3784
3934
|
}
|
3785
3935
|
}
|
3786
3936
|
|
@@ -3873,6 +4023,8 @@ static void llm_load_hparams(
|
|
3873
4023
|
}
|
3874
4024
|
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
|
3875
4025
|
|
4026
|
+
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
|
4027
|
+
|
3876
4028
|
// sanity check for n_rot (optional)
|
3877
4029
|
{
|
3878
4030
|
hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
|
@@ -3910,7 +4062,9 @@ static void llm_load_hparams(
|
|
3910
4062
|
switch (hparams.n_layer) {
|
3911
4063
|
case 22: model.type = e_model::MODEL_1B; break;
|
3912
4064
|
case 26: model.type = e_model::MODEL_3B; break;
|
3913
|
-
|
4065
|
+
// granite uses a vocab with len 49152
|
4066
|
+
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
|
4067
|
+
case 36: model.type = e_model::MODEL_8B; break; // granite
|
3914
4068
|
case 40: model.type = e_model::MODEL_13B; break;
|
3915
4069
|
case 48: model.type = e_model::MODEL_34B; break;
|
3916
4070
|
case 60: model.type = e_model::MODEL_30B; break;
|
@@ -3972,14 +4126,6 @@ static void llm_load_hparams(
|
|
3972
4126
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3973
4127
|
}
|
3974
4128
|
} break;
|
3975
|
-
case LLM_ARCH_PERSIMMON:
|
3976
|
-
{
|
3977
|
-
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
3978
|
-
switch (hparams.n_layer) {
|
3979
|
-
case 36: model.type = e_model::MODEL_8B; break;
|
3980
|
-
default: model.type = e_model::MODEL_UNKNOWN;
|
3981
|
-
}
|
3982
|
-
} break;
|
3983
4129
|
case LLM_ARCH_REFACT:
|
3984
4130
|
{
|
3985
4131
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
@@ -4121,6 +4267,7 @@ static void llm_load_hparams(
|
|
4121
4267
|
switch (hparams.n_layer) {
|
4122
4268
|
case 24: model.type = e_model::MODEL_1B; break;
|
4123
4269
|
case 32: model.type = e_model::MODEL_3B; break;
|
4270
|
+
case 40: model.type = e_model::MODEL_14B; break;
|
4124
4271
|
default: model.type = e_model::MODEL_UNKNOWN;
|
4125
4272
|
}
|
4126
4273
|
} break;
|
@@ -4187,6 +4334,8 @@ static void llm_load_hparams(
|
|
4187
4334
|
case 30: model.type = e_model::MODEL_3B; break;
|
4188
4335
|
case 32: model.type = e_model::MODEL_7B; break;
|
4189
4336
|
case 40: model.type = e_model::MODEL_15B; break;
|
4337
|
+
case 52: model.type = e_model::MODEL_20B; break; // granite
|
4338
|
+
case 88: model.type = e_model::MODEL_34B; break; // granite
|
4190
4339
|
default: model.type = e_model::MODEL_UNKNOWN;
|
4191
4340
|
}
|
4192
4341
|
} break;
|
@@ -4261,6 +4410,85 @@ static void llm_load_hparams(
|
|
4261
4410
|
default: model.type = e_model::MODEL_UNKNOWN;
|
4262
4411
|
}
|
4263
4412
|
} break;
|
4413
|
+
case LLM_ARCH_GPTNEOX:
|
4414
|
+
{
|
4415
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
4416
|
+
ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
4417
|
+
switch (hparams.n_layer) {
|
4418
|
+
case 6:
|
4419
|
+
switch (hparams.n_ff) {
|
4420
|
+
case 512: model.type = e_model::MODEL_14M; break;
|
4421
|
+
case 2048: model.type = e_model::MODEL_70M; break;
|
4422
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4423
|
+
} break;
|
4424
|
+
case 12:
|
4425
|
+
switch (hparams.n_ff) {
|
4426
|
+
case 3072: model.type = e_model::MODEL_160M; break;
|
4427
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4428
|
+
} break;
|
4429
|
+
case 16:
|
4430
|
+
switch (hparams.n_ff) {
|
4431
|
+
case 8192: model.type = e_model::MODEL_1B; break;
|
4432
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4433
|
+
} break;
|
4434
|
+
case 24:
|
4435
|
+
switch (hparams.n_ff) {
|
4436
|
+
case 4096: model.type = e_model::MODEL_410M; break;
|
4437
|
+
case 8192: model.type = e_model::MODEL_1_4B; break;
|
4438
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4439
|
+
} break;
|
4440
|
+
case 32:
|
4441
|
+
switch (hparams.n_ff) {
|
4442
|
+
case 10240: model.type = e_model::MODEL_2_8B; break;
|
4443
|
+
case 16384: model.type = e_model::MODEL_6_9B; break;
|
4444
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4445
|
+
} break;
|
4446
|
+
case 36:
|
4447
|
+
switch (hparams.n_ff) {
|
4448
|
+
case 20480: model.type = e_model::MODEL_12B; break;
|
4449
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4450
|
+
} break;
|
4451
|
+
case 44:
|
4452
|
+
switch (hparams.n_ff) {
|
4453
|
+
case 24576: model.type = e_model::MODEL_20B; break;
|
4454
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4455
|
+
} break;
|
4456
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4457
|
+
}
|
4458
|
+
} break;
|
4459
|
+
case LLM_ARCH_ARCTIC:
|
4460
|
+
{
|
4461
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
4462
|
+
|
4463
|
+
if (hparams.n_expert == 128) {
|
4464
|
+
switch (hparams.n_layer) {
|
4465
|
+
case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
|
4466
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4467
|
+
}
|
4468
|
+
} else {
|
4469
|
+
model.type = e_model::MODEL_UNKNOWN;
|
4470
|
+
}
|
4471
|
+
} break;
|
4472
|
+
case LLM_ARCH_DEEPSEEK2:
|
4473
|
+
{
|
4474
|
+
bool is_lite = (hparams.n_layer == 27);
|
4475
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
4476
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
4477
|
+
if (!is_lite) {
|
4478
|
+
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
4479
|
+
}
|
4480
|
+
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
4481
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
4482
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
4483
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
4484
|
+
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
4485
|
+
|
4486
|
+
switch (hparams.n_layer) {
|
4487
|
+
case 27: model.type = e_model::MODEL_16B; break;
|
4488
|
+
case 60: model.type = e_model::MODEL_236B; break;
|
4489
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4490
|
+
}
|
4491
|
+
} break;
|
4264
4492
|
default: (void)0;
|
4265
4493
|
}
|
4266
4494
|
|
@@ -4367,15 +4595,14 @@ static void llm_load_vocab(
|
|
4367
4595
|
vocab.special_cls_id = 101;
|
4368
4596
|
vocab.special_mask_id = 103;
|
4369
4597
|
vocab.add_space_prefix = false;
|
4370
|
-
} else {
|
4371
|
-
|
4372
|
-
|
4373
|
-
|
4374
|
-
|
4375
|
-
|
4376
|
-
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
4377
|
-
return;
|
4598
|
+
} else if (tokenizer_model == "gpt2") {
|
4599
|
+
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
4600
|
+
|
4601
|
+
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
4602
|
+
if (add_space_prefix_keyidx != -1) {
|
4603
|
+
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
4378
4604
|
}
|
4605
|
+
|
4379
4606
|
// read bpe merges and populate bpe ranks
|
4380
4607
|
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
4381
4608
|
if (merges_keyidx == -1) {
|
@@ -4409,6 +4636,8 @@ static void llm_load_vocab(
|
|
4409
4636
|
vocab.special_pad_id = -1;
|
4410
4637
|
vocab.special_cls_id = -1;
|
4411
4638
|
vocab.special_mask_id = -1;
|
4639
|
+
} else {
|
4640
|
+
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
4412
4641
|
}
|
4413
4642
|
|
4414
4643
|
// for now, only BPE models have pre-tokenizers
|
@@ -4461,12 +4690,18 @@ static void llm_load_vocab(
|
|
4461
4690
|
} else if (
|
4462
4691
|
tokenizer_pre == "qwen2") {
|
4463
4692
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
4693
|
+
} else if (
|
4694
|
+
tokenizer_pre == "stablelm2") {
|
4695
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
|
4464
4696
|
} else if (
|
4465
4697
|
tokenizer_pre == "olmo") {
|
4466
4698
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
|
4467
4699
|
} else if (
|
4468
4700
|
tokenizer_pre == "dbrx") {
|
4469
4701
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
4702
|
+
} else if (
|
4703
|
+
tokenizer_pre == "smaug-bpe") {
|
4704
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
4470
4705
|
} else {
|
4471
4706
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
4472
4707
|
}
|
@@ -4582,7 +4817,8 @@ static void llm_load_vocab(
|
|
4582
4817
|
(t.first == "<|eot_id|>" ||
|
4583
4818
|
t.first == "<|im_end|>" ||
|
4584
4819
|
t.first == "<|end|>" ||
|
4585
|
-
t.first == "<end_of_turn>"
|
4820
|
+
t.first == "<end_of_turn>" ||
|
4821
|
+
t.first == "<|endoftext|>"
|
4586
4822
|
)
|
4587
4823
|
) {
|
4588
4824
|
vocab.special_eot_id = t.second;
|
@@ -4594,97 +4830,40 @@ static void llm_load_vocab(
|
|
4594
4830
|
|
4595
4831
|
// build special tokens cache
|
4596
4832
|
{
|
4597
|
-
|
4598
|
-
// and will always be correctly labeled in 'added_tokens.json' etc.
|
4599
|
-
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
|
4600
|
-
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
|
4601
|
-
// are special tokens.
|
4602
|
-
// From testing, this appears to correlate 1:1 with special tokens.
|
4603
|
-
//
|
4604
|
-
|
4605
|
-
// Counting special tokens and verifying in only one direction
|
4606
|
-
// is sufficient to detect difference in those two sets.
|
4607
|
-
//
|
4608
|
-
uint32_t special_tokens_count_by_type = 0;
|
4609
|
-
uint32_t special_tokens_count_from_verification = 0;
|
4610
|
-
|
4611
|
-
bool special_tokens_definition_mismatch = false;
|
4612
|
-
|
4613
|
-
for (const auto & t : vocab.token_to_id) {
|
4614
|
-
const auto & token = t.first;
|
4615
|
-
const auto & id = t.second;
|
4616
|
-
|
4617
|
-
// Count all non-normal tokens in the vocab while iterating
|
4833
|
+
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
4618
4834
|
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
4619
|
-
|
4835
|
+
vocab.cache_special_tokens.push_back(id);
|
4620
4836
|
}
|
4837
|
+
}
|
4621
4838
|
|
4622
|
-
|
4623
|
-
|
4624
|
-
|
4625
|
-
|
4626
|
-
|
4627
|
-
// and check if both halves can be matched to a valid token
|
4628
|
-
for (unsigned i = 1; i < token.length();) {
|
4629
|
-
const auto left = token.substr(0, i);
|
4630
|
-
const auto right = token.substr(i);
|
4631
|
-
|
4632
|
-
// check if we didnt partition in the middle of a utf sequence
|
4633
|
-
auto utf = utf8_len(left.at(left.length() - 1));
|
4634
|
-
|
4635
|
-
if (utf == 1) {
|
4636
|
-
if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
|
4637
|
-
vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
|
4638
|
-
is_tokenizable = true;
|
4639
|
-
break;
|
4640
|
-
}
|
4641
|
-
i++;
|
4642
|
-
} else {
|
4643
|
-
// skip over the rest of multibyte utf sequence
|
4644
|
-
i += utf - 1;
|
4645
|
-
}
|
4646
|
-
}
|
4839
|
+
std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
|
4840
|
+
[&] (const llama_vocab::id a, const llama_vocab::id b) {
|
4841
|
+
return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
|
4842
|
+
}
|
4843
|
+
);
|
4647
4844
|
|
4648
|
-
|
4649
|
-
|
4650
|
-
// it's faster to re-filter them here, since there are way less candidates now
|
4845
|
+
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
|
4846
|
+
}
|
4651
4847
|
|
4652
|
-
|
4653
|
-
|
4654
|
-
|
4655
|
-
utf8_str_len++;
|
4656
|
-
i += utf8_len(token.at(i));
|
4657
|
-
}
|
4848
|
+
// build token to piece caches
|
4849
|
+
{
|
4850
|
+
size_t size_cache = 0;
|
4658
4851
|
|
4659
|
-
|
4660
|
-
|
4661
|
-
// At this point what we have left are special tokens only
|
4662
|
-
vocab.special_tokens_cache[token] = id;
|
4852
|
+
std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
|
4853
|
+
std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
|
4663
4854
|
|
4664
|
-
|
4665
|
-
|
4855
|
+
for (uint32_t id = 0; id < n_vocab; ++id) {
|
4856
|
+
cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
|
4857
|
+
cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
|
4666
4858
|
|
4667
|
-
|
4668
|
-
|
4669
|
-
special_tokens_definition_mismatch = true;
|
4670
|
-
}
|
4671
|
-
}
|
4672
|
-
}
|
4673
|
-
}
|
4859
|
+
size_cache += cache_token_to_piece[id].size();
|
4860
|
+
size_cache += cache_token_to_piece_special[id].size();
|
4674
4861
|
}
|
4675
4862
|
|
4676
|
-
|
4677
|
-
|
4678
|
-
|
4679
|
-
|
4680
|
-
special_tokens_count_by_type, vocab.id_to_token.size()
|
4681
|
-
);
|
4682
|
-
} else {
|
4683
|
-
LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
|
4684
|
-
__func__,
|
4685
|
-
special_tokens_count_from_verification, vocab.id_to_token.size()
|
4686
|
-
);
|
4687
|
-
}
|
4863
|
+
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
4864
|
+
std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
|
4865
|
+
|
4866
|
+
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
4688
4867
|
}
|
4689
4868
|
}
|
4690
4869
|
|
@@ -4765,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
4765
4944
|
if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
|
4766
4945
|
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
|
4767
4946
|
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
4947
|
+
|
4948
|
+
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
4949
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
4950
|
+
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
4951
|
+
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
4952
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
4953
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
4954
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
4955
|
+
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
4956
|
+
}
|
4768
4957
|
}
|
4769
4958
|
|
4770
4959
|
// Returns false if cancelled by progress_callback
|
@@ -4908,6 +5097,7 @@ static bool llm_load_tensors(
|
|
4908
5097
|
// create tensors for the weights
|
4909
5098
|
{
|
4910
5099
|
const int64_t n_embd = hparams.n_embd;
|
5100
|
+
const int64_t n_embd_head = n_embd / hparams.n_head;
|
4911
5101
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
4912
5102
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
4913
5103
|
const int64_t n_embd_gqa = n_embd_v_gqa;
|
@@ -4920,8 +5110,6 @@ static bool llm_load_tensors(
|
|
4920
5110
|
throw std::runtime_error("model has expert layers but no expert layers are used");
|
4921
5111
|
}
|
4922
5112
|
|
4923
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
4924
|
-
|
4925
5113
|
ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
|
4926
5114
|
ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
|
4927
5115
|
ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
|
@@ -4942,12 +5130,10 @@ static bool llm_load_tensors(
|
|
4942
5130
|
{
|
4943
5131
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4944
5132
|
if (model.arch != LLM_ARCH_MINICPM){
|
4945
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5133
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4946
5134
|
// if output is NULL, init from the input tok embed
|
4947
5135
|
if (model.output == NULL) {
|
4948
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4949
|
-
ml.n_created--; // artificial tensor
|
4950
|
-
ml.size_data += ggml_nbytes(model.output);
|
5136
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
4951
5137
|
}
|
4952
5138
|
}
|
4953
5139
|
}
|
@@ -4966,10 +5152,10 @@ static bool llm_load_tensors(
|
|
4966
5152
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4967
5153
|
|
4968
5154
|
// optional bias tensors
|
4969
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
4970
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
4971
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
4972
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
|
5155
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5156
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5157
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5158
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4973
5159
|
|
4974
5160
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4975
5161
|
|
@@ -4977,10 +5163,15 @@ static bool llm_load_tensors(
|
|
4977
5163
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4978
5164
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4979
5165
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5166
|
+
|
5167
|
+
// optional MLP bias
|
5168
|
+
layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5169
|
+
layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5170
|
+
layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4980
5171
|
} else {
|
4981
5172
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4982
5173
|
|
4983
|
-
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
|
5174
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
4984
5175
|
if (layer.ffn_gate_exps) {
|
4985
5176
|
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4986
5177
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
@@ -5022,12 +5213,10 @@ static bool llm_load_tensors(
|
|
5022
5213
|
// output
|
5023
5214
|
{
|
5024
5215
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5025
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5216
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5026
5217
|
// if output is NULL, init from the input tok embed
|
5027
5218
|
if (model.output == NULL) {
|
5028
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5029
|
-
ml.n_created--; // artificial tensor
|
5030
|
-
ml.size_data += ggml_nbytes(model.output);
|
5219
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5031
5220
|
}
|
5032
5221
|
}
|
5033
5222
|
|
@@ -5050,7 +5239,7 @@ static bool llm_load_tensors(
|
|
5050
5239
|
|
5051
5240
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
5052
5241
|
|
5053
|
-
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
|
5242
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5054
5243
|
if (layer.ffn_gate_exps) {
|
5055
5244
|
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
5056
5245
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
@@ -5152,11 +5341,9 @@ static bool llm_load_tensors(
|
|
5152
5341
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5153
5342
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5154
5343
|
|
5155
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5344
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5156
5345
|
if (!model.output) {
|
5157
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
5158
|
-
ml.n_created--; // artificial tensor
|
5159
|
-
ml.size_data += ggml_nbytes(model.output);
|
5346
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
|
5160
5347
|
}
|
5161
5348
|
}
|
5162
5349
|
|
@@ -5169,8 +5356,8 @@ static bool llm_load_tensors(
|
|
5169
5356
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5170
5357
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
5171
5358
|
|
5172
|
-
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd},
|
5173
|
-
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd},
|
5359
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5360
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5174
5361
|
|
5175
5362
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
5176
5363
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
@@ -5188,7 +5375,12 @@ static bool llm_load_tensors(
|
|
5188
5375
|
{
|
5189
5376
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5190
5377
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5191
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5378
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5379
|
+
if (!model.output) {
|
5380
|
+
// needs to be on GPU
|
5381
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5382
|
+
}
|
5383
|
+
|
5192
5384
|
}
|
5193
5385
|
|
5194
5386
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -5216,47 +5408,6 @@ static bool llm_load_tensors(
|
|
5216
5408
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
5217
5409
|
}
|
5218
5410
|
} break;
|
5219
|
-
case LLM_ARCH_PERSIMMON:
|
5220
|
-
{
|
5221
|
-
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5222
|
-
|
5223
|
-
{
|
5224
|
-
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5225
|
-
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5226
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5227
|
-
}
|
5228
|
-
|
5229
|
-
for (int i = 0; i < n_layer; ++i) {
|
5230
|
-
ggml_context * ctx_layer = ctx_for_layer(i);
|
5231
|
-
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5232
|
-
|
5233
|
-
auto & layer = model.layers[i];
|
5234
|
-
|
5235
|
-
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5236
|
-
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
5237
|
-
|
5238
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
5239
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
5240
|
-
|
5241
|
-
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5242
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
5243
|
-
|
5244
|
-
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
5245
|
-
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
5246
|
-
|
5247
|
-
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5248
|
-
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
5249
|
-
|
5250
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5251
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
5252
|
-
|
5253
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
|
5254
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
|
5255
|
-
|
5256
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
|
5257
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
5258
|
-
}
|
5259
|
-
} break;
|
5260
5411
|
case LLM_ARCH_BERT:
|
5261
5412
|
case LLM_ARCH_NOMIC_BERT:
|
5262
5413
|
{
|
@@ -5325,14 +5476,14 @@ static bool llm_load_tensors(
|
|
5325
5476
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5326
5477
|
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
5327
5478
|
|
5328
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
|
5329
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
|
5479
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5480
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5330
5481
|
|
5331
5482
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5332
5483
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
5333
5484
|
|
5334
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
|
5335
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
|
5485
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5486
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5336
5487
|
|
5337
5488
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5338
5489
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
@@ -5394,18 +5545,16 @@ static bool llm_load_tensors(
|
|
5394
5545
|
case LLM_ARCH_MPT:
|
5395
5546
|
{
|
5396
5547
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5397
|
-
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train},
|
5548
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5398
5549
|
|
5399
5550
|
// output
|
5400
5551
|
{
|
5401
5552
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5402
|
-
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd},
|
5553
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5403
5554
|
|
5404
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5555
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5405
5556
|
if (!model.output) {
|
5406
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
5407
|
-
ml.n_created--; // artificial tensor
|
5408
|
-
ml.size_data += ggml_nbytes(model.output);
|
5557
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
|
5409
5558
|
}
|
5410
5559
|
}
|
5411
5560
|
|
@@ -5416,31 +5565,31 @@ static bool llm_load_tensors(
|
|
5416
5565
|
auto & layer = model.layers[i];
|
5417
5566
|
|
5418
5567
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5419
|
-
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd},
|
5568
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5420
5569
|
|
5421
5570
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
5422
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
5571
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5423
5572
|
|
5424
5573
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5425
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
|
5574
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5426
5575
|
|
5427
5576
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5428
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
5577
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5429
5578
|
|
5430
5579
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
5431
|
-
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd},
|
5580
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5432
5581
|
|
5433
5582
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5434
|
-
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff},
|
5583
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5435
5584
|
|
5436
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
|
5437
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
|
5585
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5586
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5438
5587
|
|
5439
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
|
5440
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
|
5588
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5589
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5441
5590
|
|
5442
5591
|
// AWQ ScaleActivation layer
|
5443
|
-
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff},
|
5592
|
+
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5444
5593
|
}
|
5445
5594
|
} break;
|
5446
5595
|
case LLM_ARCH_STABLELM:
|
@@ -5469,17 +5618,17 @@ static bool llm_load_tensors(
|
|
5469
5618
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5470
5619
|
|
5471
5620
|
// optional bias tensors, present in Stable LM 2 1.6B
|
5472
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
5473
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
5474
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
5621
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5622
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5623
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5475
5624
|
|
5476
5625
|
// optional q and k layernorms, present in StableLM 2 12B
|
5477
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},
|
5478
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv},
|
5626
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5627
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5479
5628
|
|
5480
5629
|
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
5481
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},
|
5482
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
5630
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5631
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5483
5632
|
|
5484
5633
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5485
5634
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
@@ -5522,12 +5671,10 @@ static bool llm_load_tensors(
|
|
5522
5671
|
// output
|
5523
5672
|
{
|
5524
5673
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5525
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5674
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5526
5675
|
// if output is NULL, init from the input tok embed
|
5527
5676
|
if (model.output == NULL) {
|
5528
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5529
|
-
ml.n_created--; // artificial tensor
|
5530
|
-
ml.size_data += ggml_nbytes(model.output);
|
5677
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5531
5678
|
}
|
5532
5679
|
}
|
5533
5680
|
|
@@ -5625,8 +5772,8 @@ static bool llm_load_tensors(
|
|
5625
5772
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5626
5773
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
5627
5774
|
|
5628
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa},
|
5629
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
5775
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5776
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5630
5777
|
|
5631
5778
|
if (layer.wqkv == nullptr) {
|
5632
5779
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
@@ -5663,17 +5810,20 @@ static bool llm_load_tensors(
|
|
5663
5810
|
ggml_context* ctx_layer = ctx_for_layer(i);
|
5664
5811
|
ggml_context* ctx_split = ctx_for_layer_split(i);
|
5665
5812
|
|
5666
|
-
auto& layer = model.layers[i];
|
5813
|
+
auto & layer = model.layers[i];
|
5667
5814
|
|
5668
5815
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
5669
5816
|
|
5670
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa },
|
5671
|
-
layer.wo
|
5817
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5818
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
5672
5819
|
|
5673
5820
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
5674
5821
|
|
5675
5822
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
5676
5823
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
5824
|
+
|
5825
|
+
layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
5826
|
+
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
5677
5827
|
}
|
5678
5828
|
} break;
|
5679
5829
|
case LLM_ARCH_PLAMO:
|
@@ -5842,9 +5992,7 @@ static bool llm_load_tensors(
|
|
5842
5992
|
|
5843
5993
|
// output
|
5844
5994
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5845
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
5846
|
-
ml.n_created--; // artificial tensor
|
5847
|
-
ml.size_data += ggml_nbytes(model.output);
|
5995
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
5848
5996
|
|
5849
5997
|
const int64_t n_ff = hparams.n_ff;
|
5850
5998
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
@@ -5879,12 +6027,10 @@ static bool llm_load_tensors(
|
|
5879
6027
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5880
6028
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5881
6029
|
|
5882
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6030
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5883
6031
|
// if output is NULL, init from the input tok embed
|
5884
6032
|
if (model.output == NULL) {
|
5885
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5886
|
-
ml.n_created--; // artificial tensor
|
5887
|
-
ml.size_data += ggml_nbytes(model.output);
|
6033
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5888
6034
|
}
|
5889
6035
|
|
5890
6036
|
}
|
@@ -5935,12 +6081,10 @@ static bool llm_load_tensors(
|
|
5935
6081
|
{
|
5936
6082
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5937
6083
|
|
5938
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6084
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5939
6085
|
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
5940
6086
|
if (model.output == NULL) {
|
5941
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5942
|
-
ml.n_created--; // artificial tensor
|
5943
|
-
ml.size_data += ggml_nbytes(model.output);
|
6087
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5944
6088
|
}
|
5945
6089
|
}
|
5946
6090
|
|
@@ -6001,9 +6145,7 @@ static bool llm_load_tensors(
|
|
6001
6145
|
{
|
6002
6146
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6003
6147
|
// init output from the input tok embed
|
6004
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6005
|
-
ml.n_created--; // artificial tensor
|
6006
|
-
ml.size_data += ggml_nbytes(model.output);
|
6148
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
6007
6149
|
}
|
6008
6150
|
|
6009
6151
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -6035,12 +6177,10 @@ static bool llm_load_tensors(
|
|
6035
6177
|
|
6036
6178
|
// output
|
6037
6179
|
{
|
6038
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6180
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
6039
6181
|
// if output is NULL, init from the input tok embed
|
6040
6182
|
if (model.output == NULL) {
|
6041
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6042
|
-
ml.n_created--; // artificial tensor
|
6043
|
-
ml.size_data += ggml_nbytes(model.output);
|
6183
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
6044
6184
|
}
|
6045
6185
|
}
|
6046
6186
|
|
@@ -6060,30 +6200,169 @@ static bool llm_load_tensors(
|
|
6060
6200
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6061
6201
|
}
|
6062
6202
|
} break;
|
6063
|
-
|
6064
|
-
|
6065
|
-
|
6066
|
-
|
6203
|
+
case LLM_ARCH_GPTNEOX:
|
6204
|
+
{
|
6205
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6206
|
+
// output
|
6207
|
+
{
|
6208
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6209
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
6210
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
6211
|
+
}
|
6067
6212
|
|
6068
|
-
|
6213
|
+
for (int i = 0; i < n_layer; ++i) {
|
6214
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6215
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6069
6216
|
|
6070
|
-
|
6071
|
-
model.mappings.reserve(ml.mappings.size());
|
6217
|
+
auto & layer = model.layers[i];
|
6072
6218
|
|
6073
|
-
|
6074
|
-
|
6075
|
-
ctx_bufs.reserve(ctx_map.size());
|
6219
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6220
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
6076
6221
|
|
6077
|
-
|
6078
|
-
|
6079
|
-
model.bufs.reserve(n_max_backend_buffer);
|
6222
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
6223
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
6080
6224
|
|
6081
|
-
|
6082
|
-
|
6083
|
-
ggml_context * ctx = it.second;
|
6225
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
6226
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
6084
6227
|
|
6085
|
-
|
6086
|
-
|
6228
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6229
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
6230
|
+
|
6231
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
6232
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
6233
|
+
|
6234
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6235
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
6236
|
+
}
|
6237
|
+
} break;
|
6238
|
+
case LLM_ARCH_ARCTIC:
|
6239
|
+
{
|
6240
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6241
|
+
|
6242
|
+
// output
|
6243
|
+
{
|
6244
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6245
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
6246
|
+
// if output is NULL, init from the input tok embed
|
6247
|
+
if (model.output == NULL) {
|
6248
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
6249
|
+
}
|
6250
|
+
}
|
6251
|
+
|
6252
|
+
for (int i = 0; i < n_layer; ++i) {
|
6253
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6254
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6255
|
+
|
6256
|
+
auto & layer = model.layers[i];
|
6257
|
+
|
6258
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6259
|
+
|
6260
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
6261
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
6262
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
6263
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
6264
|
+
|
6265
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6266
|
+
|
6267
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
|
6268
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
|
6269
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
|
6270
|
+
|
6271
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
6272
|
+
layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
|
6273
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
6274
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
6275
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
6276
|
+
}
|
6277
|
+
} break;
|
6278
|
+
case LLM_ARCH_DEEPSEEK2:
|
6279
|
+
{
|
6280
|
+
bool is_lite = (hparams.n_layer == 27);
|
6281
|
+
|
6282
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
6283
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
6284
|
+
const uint32_t q_lora_rank = hparams.n_lora_q;
|
6285
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
6286
|
+
const uint32_t n_ff_exp = hparams.n_ff_exp;
|
6287
|
+
|
6288
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6289
|
+
|
6290
|
+
// output
|
6291
|
+
{
|
6292
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6293
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
6294
|
+
}
|
6295
|
+
|
6296
|
+
for (int i = 0; i < n_layer; ++i) {
|
6297
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6298
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6299
|
+
|
6300
|
+
auto & layer = model.layers[i];
|
6301
|
+
|
6302
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6303
|
+
if (!is_lite) {
|
6304
|
+
layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
|
6305
|
+
}
|
6306
|
+
layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
|
6307
|
+
|
6308
|
+
if (!is_lite) {
|
6309
|
+
layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
|
6310
|
+
layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
|
6311
|
+
} else {
|
6312
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
|
6313
|
+
}
|
6314
|
+
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
|
6315
|
+
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
|
6316
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
|
6317
|
+
|
6318
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6319
|
+
|
6320
|
+
if ((uint32_t) i < hparams.n_layer_dense_lead) {
|
6321
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
6322
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
6323
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6324
|
+
} else {
|
6325
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
6326
|
+
|
6327
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
6328
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
6329
|
+
|
6330
|
+
// MoE branch
|
6331
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
6332
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
6333
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
6334
|
+
|
6335
|
+
// Shared expert branch
|
6336
|
+
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
6337
|
+
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
|
6338
|
+
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
6339
|
+
}
|
6340
|
+
}
|
6341
|
+
} break;
|
6342
|
+
default:
|
6343
|
+
throw std::runtime_error("unknown architecture");
|
6344
|
+
}
|
6345
|
+
}
|
6346
|
+
|
6347
|
+
ml.done_getting_tensors();
|
6348
|
+
|
6349
|
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
6350
|
+
model.mappings.reserve(ml.mappings.size());
|
6351
|
+
|
6352
|
+
// create the backend buffers
|
6353
|
+
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
6354
|
+
ctx_bufs.reserve(ctx_map.size());
|
6355
|
+
|
6356
|
+
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
6357
|
+
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
6358
|
+
model.bufs.reserve(n_max_backend_buffer);
|
6359
|
+
|
6360
|
+
for (auto & it : ctx_map) {
|
6361
|
+
ggml_backend_buffer_type_t buft = it.first;
|
6362
|
+
ggml_context * ctx = it.second;
|
6363
|
+
|
6364
|
+
llama_buf_map bufs;
|
6365
|
+
bufs.reserve(n_max_backend_buffer);
|
6087
6366
|
|
6088
6367
|
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
6089
6368
|
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
@@ -6324,10 +6603,7 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
6324
6603
|
|
6325
6604
|
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
6326
6605
|
} else {
|
6327
|
-
|
6328
|
-
GGML_ASSERT(false && "not implemented");
|
6329
|
-
#endif
|
6330
|
-
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
6606
|
+
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
6331
6607
|
inpL = lctx.inp_embd;
|
6332
6608
|
ggml_set_input(lctx.inp_embd);
|
6333
6609
|
}
|
@@ -6517,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
6517
6793
|
int64_t n_expert_used,
|
6518
6794
|
llm_ffn_op_type type_op,
|
6519
6795
|
bool norm_w,
|
6796
|
+
bool scale_w,
|
6797
|
+
float w_scale,
|
6520
6798
|
const llm_build_cb & cb,
|
6521
6799
|
int il) {
|
6522
6800
|
int64_t n_embd = cur->ne[0];
|
@@ -6548,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
6548
6826
|
|
6549
6827
|
weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
|
6550
6828
|
}
|
6829
|
+
if (scale_w) {
|
6830
|
+
weights = ggml_scale(ctx, weights, w_scale);
|
6831
|
+
cb(weights, "ffn_moe_weights_scaled", il);
|
6832
|
+
}
|
6551
6833
|
|
6552
6834
|
cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
6553
6835
|
ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
@@ -6652,7 +6934,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6652
6934
|
|
6653
6935
|
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
6654
6936
|
|
6655
|
-
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6937
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
6656
6938
|
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
6657
6939
|
}
|
6658
6940
|
|
@@ -6661,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6661
6943
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
6662
6944
|
cb(kq, "kq", il);
|
6663
6945
|
|
6664
|
-
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6946
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
6665
6947
|
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
6666
6948
|
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
6667
6949
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
@@ -6886,17 +7168,20 @@ struct llm_build_context {
|
|
6886
7168
|
cb(lctx.inp_K_shift, "K_shift", -1);
|
6887
7169
|
ggml_set_input(lctx.inp_K_shift);
|
6888
7170
|
|
7171
|
+
|
6889
7172
|
for (int il = 0; il < n_layer; ++il) {
|
7173
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
6890
7174
|
struct ggml_tensor * tmp =
|
6891
7175
|
// we rotate only the first n_rot dimensions
|
6892
|
-
|
7176
|
+
ggml_rope_ext_inplace(ctx0,
|
6893
7177
|
ggml_view_3d(ctx0, kv_self.k_l[il],
|
6894
7178
|
n_embd_head_k, n_head_kv, n_ctx,
|
6895
7179
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
6896
7180
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
6897
7181
|
0),
|
6898
|
-
lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7182
|
+
lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6899
7183
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7184
|
+
|
6900
7185
|
cb(tmp, "K_shifted", il);
|
6901
7186
|
ggml_build_forward_expand(gf, tmp);
|
6902
7187
|
}
|
@@ -6999,6 +7284,17 @@ struct llm_build_context {
|
|
6999
7284
|
return lctx.inp_pos;
|
7000
7285
|
}
|
7001
7286
|
|
7287
|
+
struct ggml_tensor * build_rope_factors(int il) {
|
7288
|
+
// choose long/short freq factors based on the context size
|
7289
|
+
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
7290
|
+
|
7291
|
+
if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
|
7292
|
+
return model.layers[il].rope_long;
|
7293
|
+
}
|
7294
|
+
|
7295
|
+
return model.layers[il].rope_short;
|
7296
|
+
}
|
7297
|
+
|
7002
7298
|
struct ggml_tensor * build_inp_out_ids() {
|
7003
7299
|
lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
7004
7300
|
cb(lctx.inp_out_ids, "inp_out_ids", -1);
|
@@ -7106,15 +7402,15 @@ struct llm_build_context {
|
|
7106
7402
|
cb(Vcur, "Vcur", il);
|
7107
7403
|
}
|
7108
7404
|
|
7109
|
-
Qcur =
|
7110
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7405
|
+
Qcur = ggml_rope_ext(
|
7406
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7111
7407
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7112
7408
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7113
7409
|
);
|
7114
7410
|
cb(Qcur, "Qcur", il);
|
7115
7411
|
|
7116
|
-
Kcur =
|
7117
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7412
|
+
Kcur = ggml_rope_ext(
|
7413
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7118
7414
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7119
7415
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7120
7416
|
);
|
@@ -7144,9 +7440,9 @@ struct llm_build_context {
|
|
7144
7440
|
cb(cur, "ffn_norm", il);
|
7145
7441
|
|
7146
7442
|
cur = llm_build_ffn(ctx0, cur,
|
7147
|
-
model.layers[il].ffn_up,
|
7148
|
-
model.layers[il].ffn_gate,
|
7149
|
-
model.layers[il].ffn_down,
|
7443
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
7444
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
|
7445
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
7150
7446
|
NULL,
|
7151
7447
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
7152
7448
|
cb(cur, "ffn_out", il);
|
@@ -7164,6 +7460,7 @@ struct llm_build_context {
|
|
7164
7460
|
model.layers[il].ffn_down_exps,
|
7165
7461
|
n_expert, n_expert_used,
|
7166
7462
|
LLM_FFN_SILU, true,
|
7463
|
+
false, 0.0,
|
7167
7464
|
cb, il);
|
7168
7465
|
cb(cur, "ffn_moe_out", il);
|
7169
7466
|
}
|
@@ -7236,13 +7533,13 @@ struct llm_build_context {
|
|
7236
7533
|
|
7237
7534
|
switch (model.type) {
|
7238
7535
|
case MODEL_7B:
|
7239
|
-
Qcur =
|
7240
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7536
|
+
Qcur = ggml_rope_ext(
|
7537
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7241
7538
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7242
7539
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7243
7540
|
);
|
7244
|
-
Kcur =
|
7245
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7541
|
+
Kcur = ggml_rope_ext(
|
7542
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7246
7543
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7247
7544
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7248
7545
|
);
|
@@ -7348,15 +7645,15 @@ struct llm_build_context {
|
|
7348
7645
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
7349
7646
|
cb(Vcur, "Vcur", il);
|
7350
7647
|
|
7351
|
-
Qcur =
|
7352
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7648
|
+
Qcur = ggml_rope_ext(
|
7649
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7353
7650
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7354
7651
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7355
7652
|
);
|
7356
7653
|
cb(Qcur, "Qcur", il);
|
7357
7654
|
|
7358
|
-
Kcur =
|
7359
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7655
|
+
Kcur = ggml_rope_ext(
|
7656
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7360
7657
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7361
7658
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7362
7659
|
);
|
@@ -7469,14 +7766,14 @@ struct llm_build_context {
|
|
7469
7766
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7470
7767
|
|
7471
7768
|
// using mode = 2 for neox mode
|
7472
|
-
Qcur =
|
7473
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7769
|
+
Qcur = ggml_rope_ext(
|
7770
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
7474
7771
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7475
7772
|
);
|
7476
7773
|
cb(Qcur, "Qcur", il);
|
7477
7774
|
|
7478
|
-
Kcur =
|
7479
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7775
|
+
Kcur = ggml_rope_ext(
|
7776
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
7480
7777
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7481
7778
|
);
|
7482
7779
|
cb(Kcur, "Kcur", il);
|
@@ -7592,15 +7889,15 @@ struct llm_build_context {
|
|
7592
7889
|
cb(Vcur, "Vcur", il);
|
7593
7890
|
}
|
7594
7891
|
|
7595
|
-
Qcur =
|
7596
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7892
|
+
Qcur = ggml_rope_ext(
|
7893
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7597
7894
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7598
7895
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7599
7896
|
);
|
7600
7897
|
cb(Qcur, "Qcur", il);
|
7601
7898
|
|
7602
|
-
Kcur =
|
7603
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7899
|
+
Kcur = ggml_rope_ext(
|
7900
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7604
7901
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7605
7902
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7606
7903
|
);
|
@@ -7645,6 +7942,7 @@ struct llm_build_context {
|
|
7645
7942
|
model.layers[il].ffn_down_exps,
|
7646
7943
|
n_expert, n_expert_used,
|
7647
7944
|
LLM_FFN_GELU, true,
|
7945
|
+
false, 0.0,
|
7648
7946
|
cb, il);
|
7649
7947
|
cb(cur, "ffn_moe_out", il);
|
7650
7948
|
|
@@ -7744,15 +8042,15 @@ struct llm_build_context {
|
|
7744
8042
|
cb(Kcur, "Kcur", il);
|
7745
8043
|
cb(Vcur, "Vcur", il);
|
7746
8044
|
|
7747
|
-
Qcur =
|
7748
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8045
|
+
Qcur = ggml_rope_ext(
|
8046
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7749
8047
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7750
8048
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7751
8049
|
);
|
7752
8050
|
cb(Qcur, "Qcur", il);
|
7753
8051
|
|
7754
|
-
Kcur =
|
7755
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8052
|
+
Kcur = ggml_rope_ext(
|
8053
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7756
8054
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7757
8055
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7758
8056
|
);
|
@@ -7788,6 +8086,7 @@ struct llm_build_context {
|
|
7788
8086
|
model.layers[il].ffn_down_exps,
|
7789
8087
|
n_expert, n_expert_used,
|
7790
8088
|
LLM_FFN_SILU, true,
|
8089
|
+
false, 0.0,
|
7791
8090
|
cb, il);
|
7792
8091
|
cb(cur, "ffn_moe_out", il);
|
7793
8092
|
|
@@ -7921,213 +8220,6 @@ struct llm_build_context {
|
|
7921
8220
|
return gf;
|
7922
8221
|
}
|
7923
8222
|
|
7924
|
-
struct ggml_cgraph * build_persimmon() {
|
7925
|
-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7926
|
-
|
7927
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7928
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7929
|
-
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
7930
|
-
|
7931
|
-
struct ggml_tensor * cur;
|
7932
|
-
struct ggml_tensor * inpL;
|
7933
|
-
|
7934
|
-
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7935
|
-
|
7936
|
-
// inp_pos - contains the positions
|
7937
|
-
struct ggml_tensor * inp_pos = build_inp_pos();
|
7938
|
-
|
7939
|
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7940
|
-
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7941
|
-
|
7942
|
-
for (int il = 0; il < n_layer; ++il) {
|
7943
|
-
struct ggml_tensor * residual = inpL;
|
7944
|
-
|
7945
|
-
cur = llm_build_norm(ctx0, inpL, hparams,
|
7946
|
-
model.layers[il].attn_norm,
|
7947
|
-
model.layers[il].attn_norm_b,
|
7948
|
-
LLM_NORM, cb, il);
|
7949
|
-
cb(cur, "attn_norm", il);
|
7950
|
-
|
7951
|
-
// self attention
|
7952
|
-
{
|
7953
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
7954
|
-
cb(cur, "wqkv", il);
|
7955
|
-
|
7956
|
-
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
7957
|
-
cb(cur, "bqkv", il);
|
7958
|
-
|
7959
|
-
// split qkv
|
7960
|
-
GGML_ASSERT(n_head_kv == n_head);
|
7961
|
-
|
7962
|
-
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
|
7963
|
-
cb(tmpqkv, "tmpqkv", il);
|
7964
|
-
|
7965
|
-
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
|
7966
|
-
cb(tmpqkv_perm, "tmpqkv", il);
|
7967
|
-
|
7968
|
-
struct ggml_tensor * tmpq = ggml_view_3d(
|
7969
|
-
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
7970
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
7971
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
7972
|
-
0
|
7973
|
-
);
|
7974
|
-
cb(tmpq, "tmpq", il);
|
7975
|
-
|
7976
|
-
struct ggml_tensor * tmpk = ggml_view_3d(
|
7977
|
-
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
7978
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
7979
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
7980
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
|
7981
|
-
);
|
7982
|
-
cb(tmpk, "tmpk", il);
|
7983
|
-
|
7984
|
-
// Q/K Layernorm
|
7985
|
-
tmpq = llm_build_norm(ctx0, tmpq, hparams,
|
7986
|
-
model.layers[il].attn_q_norm,
|
7987
|
-
model.layers[il].attn_q_norm_b,
|
7988
|
-
LLM_NORM, cb, il);
|
7989
|
-
cb(tmpq, "tmpq", il);
|
7990
|
-
|
7991
|
-
tmpk = llm_build_norm(ctx0, tmpk, hparams,
|
7992
|
-
model.layers[il].attn_k_norm,
|
7993
|
-
model.layers[il].attn_k_norm_b,
|
7994
|
-
LLM_NORM, cb, il);
|
7995
|
-
cb(tmpk, "tmpk", il);
|
7996
|
-
|
7997
|
-
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
7998
|
-
struct ggml_tensor * qrot = ggml_view_3d(
|
7999
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
8000
|
-
ggml_element_size(tmpq) * n_embd_head,
|
8001
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
8002
|
-
0
|
8003
|
-
);
|
8004
|
-
cb(qrot, "qrot", il);
|
8005
|
-
|
8006
|
-
struct ggml_tensor * krot = ggml_view_3d(
|
8007
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
8008
|
-
ggml_element_size(tmpk) * n_embd_head,
|
8009
|
-
ggml_element_size(tmpk) * n_embd_head * n_head,
|
8010
|
-
0
|
8011
|
-
);
|
8012
|
-
cb(krot, "krot", il);
|
8013
|
-
|
8014
|
-
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
8015
|
-
struct ggml_tensor * qpass = ggml_view_3d(
|
8016
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
8017
|
-
ggml_element_size(tmpq) * n_embd_head,
|
8018
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
8019
|
-
ggml_element_size(tmpq) * n_rot
|
8020
|
-
);
|
8021
|
-
cb(qpass, "qpass", il);
|
8022
|
-
|
8023
|
-
struct ggml_tensor * kpass = ggml_view_3d(
|
8024
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
8025
|
-
ggml_element_size(tmpk) * n_embd_head,
|
8026
|
-
ggml_element_size(tmpk) * n_embd_head * n_head,
|
8027
|
-
ggml_element_size(tmpk) * n_rot
|
8028
|
-
);
|
8029
|
-
cb(kpass, "kpass", il);
|
8030
|
-
|
8031
|
-
struct ggml_tensor * qrotated = ggml_rope_custom(
|
8032
|
-
ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8033
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8034
|
-
);
|
8035
|
-
cb(qrotated, "qrotated", il);
|
8036
|
-
|
8037
|
-
struct ggml_tensor * krotated = ggml_rope_custom(
|
8038
|
-
ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8039
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8040
|
-
);
|
8041
|
-
cb(krotated, "krotated", il);
|
8042
|
-
|
8043
|
-
// ggml currently only supports concatenation on dim=2
|
8044
|
-
// so we need to permute qrot, qpass, concat, then permute back.
|
8045
|
-
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
8046
|
-
cb(qrotated, "qrotated", il);
|
8047
|
-
|
8048
|
-
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
8049
|
-
cb(krotated, "krotated", il);
|
8050
|
-
|
8051
|
-
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
8052
|
-
cb(qpass, "qpass", il);
|
8053
|
-
|
8054
|
-
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
8055
|
-
cb(kpass, "kpass", il);
|
8056
|
-
|
8057
|
-
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
8058
|
-
cb(Qcur, "Qcur", il);
|
8059
|
-
|
8060
|
-
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
8061
|
-
cb(Kcur, "Kcur", il);
|
8062
|
-
|
8063
|
-
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
|
8064
|
-
cb(Q, "Q", il);
|
8065
|
-
|
8066
|
-
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
8067
|
-
cb(Kcur, "Kcur", il);
|
8068
|
-
|
8069
|
-
struct ggml_tensor * Vcur = ggml_view_3d(
|
8070
|
-
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
8071
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
8072
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
8073
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
|
8074
|
-
);
|
8075
|
-
cb(Vcur, "Vcur", il);
|
8076
|
-
|
8077
|
-
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8078
|
-
model.layers[il].wo, model.layers[il].bo,
|
8079
|
-
Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8080
|
-
}
|
8081
|
-
|
8082
|
-
if (il == n_layer - 1) {
|
8083
|
-
// skip computing output for unused tokens
|
8084
|
-
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8085
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8086
|
-
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
8087
|
-
}
|
8088
|
-
|
8089
|
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
8090
|
-
cb(ffn_inp, "ffn_inp", il);
|
8091
|
-
|
8092
|
-
// feed-forward network
|
8093
|
-
{
|
8094
|
-
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8095
|
-
model.layers[il].ffn_norm,
|
8096
|
-
model.layers[il].ffn_norm_b,
|
8097
|
-
LLM_NORM, cb, il);
|
8098
|
-
cb(cur, "ffn_norm", il);
|
8099
|
-
|
8100
|
-
cur = llm_build_ffn(ctx0, cur,
|
8101
|
-
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
8102
|
-
NULL, NULL,
|
8103
|
-
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8104
|
-
NULL,
|
8105
|
-
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
|
8106
|
-
cb(cur, "ffn_out", il);
|
8107
|
-
}
|
8108
|
-
|
8109
|
-
cur = ggml_add(ctx0, cur, ffn_inp);
|
8110
|
-
cb(cur, "l_out", il);
|
8111
|
-
|
8112
|
-
inpL = cur;
|
8113
|
-
}
|
8114
|
-
|
8115
|
-
cur = inpL;
|
8116
|
-
|
8117
|
-
cur = llm_build_norm(ctx0, cur, hparams,
|
8118
|
-
model.output_norm,
|
8119
|
-
model.output_norm_b,
|
8120
|
-
LLM_NORM, cb, -1);
|
8121
|
-
cb(cur, "result_norm", -1);
|
8122
|
-
|
8123
|
-
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8124
|
-
cb(cur, "result_output", -1);
|
8125
|
-
|
8126
|
-
ggml_build_forward_expand(gf, cur);
|
8127
|
-
|
8128
|
-
return gf;
|
8129
|
-
}
|
8130
|
-
|
8131
8223
|
struct ggml_cgraph * build_refact() {
|
8132
8224
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8133
8225
|
|
@@ -8304,15 +8396,15 @@ struct llm_build_context {
|
|
8304
8396
|
cb(Kcur, "Kcur", il);
|
8305
8397
|
cb(Vcur, "Vcur", il);
|
8306
8398
|
|
8307
|
-
Qcur =
|
8308
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8399
|
+
Qcur = ggml_rope_ext(
|
8400
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8309
8401
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8310
8402
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8311
8403
|
);
|
8312
8404
|
cb(Qcur, "Qcur", il);
|
8313
8405
|
|
8314
|
-
Kcur =
|
8315
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8406
|
+
Kcur = ggml_rope_ext(
|
8407
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8316
8408
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8317
8409
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8318
8410
|
);
|
@@ -8744,15 +8836,15 @@ struct llm_build_context {
|
|
8744
8836
|
}
|
8745
8837
|
|
8746
8838
|
|
8747
|
-
Qcur =
|
8748
|
-
ctx0, Qcur, inp_pos,
|
8839
|
+
Qcur = ggml_rope_ext(
|
8840
|
+
ctx0, Qcur, inp_pos, nullptr,
|
8749
8841
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8750
8842
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8751
8843
|
);
|
8752
8844
|
cb(Qcur, "Qcur", il);
|
8753
8845
|
|
8754
|
-
Kcur =
|
8755
|
-
ctx0, Kcur, inp_pos,
|
8846
|
+
Kcur = ggml_rope_ext(
|
8847
|
+
ctx0, Kcur, inp_pos, nullptr,
|
8756
8848
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8757
8849
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8758
8850
|
);
|
@@ -8864,14 +8956,14 @@ struct llm_build_context {
|
|
8864
8956
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8865
8957
|
|
8866
8958
|
// using mode = 2 for neox mode
|
8867
|
-
Qcur =
|
8868
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8959
|
+
Qcur = ggml_rope_ext(
|
8960
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
8869
8961
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8870
8962
|
);
|
8871
8963
|
cb(Qcur, "Qcur", il);
|
8872
8964
|
|
8873
|
-
Kcur =
|
8874
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8965
|
+
Kcur = ggml_rope_ext(
|
8966
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
8875
8967
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8876
8968
|
);
|
8877
8969
|
cb(Kcur, "Kcur", il);
|
@@ -8975,15 +9067,15 @@ struct llm_build_context {
|
|
8975
9067
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8976
9068
|
cb(Vcur, "Vcur", il);
|
8977
9069
|
|
8978
|
-
Qcur =
|
8979
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9070
|
+
Qcur = ggml_rope_ext(
|
9071
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8980
9072
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8981
9073
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8982
9074
|
);
|
8983
9075
|
cb(Qcur, "Qcur", il);
|
8984
9076
|
|
8985
|
-
Kcur =
|
8986
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9077
|
+
Kcur = ggml_rope_ext(
|
9078
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8987
9079
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8988
9080
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8989
9081
|
);
|
@@ -9089,15 +9181,15 @@ struct llm_build_context {
|
|
9089
9181
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
9090
9182
|
cb(Vcur, "Vcur", il);
|
9091
9183
|
|
9092
|
-
Qcur =
|
9093
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9184
|
+
Qcur = ggml_rope_ext(
|
9185
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9094
9186
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9095
9187
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9096
9188
|
);
|
9097
9189
|
cb(Qcur, "Qcur", il);
|
9098
9190
|
|
9099
|
-
Kcur =
|
9100
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9191
|
+
Kcur = ggml_rope_ext(
|
9192
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9101
9193
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9102
9194
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9103
9195
|
);
|
@@ -9133,6 +9225,7 @@ struct llm_build_context {
|
|
9133
9225
|
model.layers[il].ffn_down_exps,
|
9134
9226
|
n_expert, n_expert_used,
|
9135
9227
|
LLM_FFN_SILU, false,
|
9228
|
+
false, 0.0,
|
9136
9229
|
cb, il);
|
9137
9230
|
cb(cur, "ffn_moe_out", il);
|
9138
9231
|
|
@@ -9241,8 +9334,8 @@ struct llm_build_context {
|
|
9241
9334
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9242
9335
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9243
9336
|
|
9244
|
-
Qcur =
|
9245
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9337
|
+
Qcur = ggml_rope_ext(
|
9338
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9246
9339
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9247
9340
|
);
|
9248
9341
|
cb(Qcur, "Qcur", il);
|
@@ -9252,8 +9345,8 @@ struct llm_build_context {
|
|
9252
9345
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
9253
9346
|
cb(Qcur, "Qcur", il);
|
9254
9347
|
|
9255
|
-
Kcur =
|
9256
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9348
|
+
Kcur = ggml_rope_ext(
|
9349
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9257
9350
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9258
9351
|
);
|
9259
9352
|
cb(Kcur, "Kcur", il);
|
@@ -9329,6 +9422,9 @@ struct llm_build_context {
|
|
9329
9422
|
|
9330
9423
|
// self-attention
|
9331
9424
|
{
|
9425
|
+
// rope freq factors for 128k context
|
9426
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
9427
|
+
|
9332
9428
|
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
9333
9429
|
model.layers[il].attn_norm,
|
9334
9430
|
NULL,
|
@@ -9360,8 +9456,8 @@ struct llm_build_context {
|
|
9360
9456
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9361
9457
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9362
9458
|
|
9363
|
-
Qcur =
|
9364
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9459
|
+
Qcur = ggml_rope_ext(
|
9460
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
9365
9461
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9366
9462
|
);
|
9367
9463
|
cb(Qcur, "Qcur", il);
|
@@ -9369,8 +9465,8 @@ struct llm_build_context {
|
|
9369
9465
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
9370
9466
|
cb(Qcur, "Qcur", il);
|
9371
9467
|
|
9372
|
-
Kcur =
|
9373
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9468
|
+
Kcur = ggml_rope_ext(
|
9469
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
9374
9470
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9375
9471
|
);
|
9376
9472
|
cb(Kcur, "Kcur", il);
|
@@ -9476,14 +9572,14 @@ struct llm_build_context {
|
|
9476
9572
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
9477
9573
|
cb(Vcur, "Vcur", il);
|
9478
9574
|
|
9479
|
-
Qcur =
|
9480
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
|
9575
|
+
Qcur = ggml_rope_ext(
|
9576
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
9481
9577
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9482
9578
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9483
9579
|
cb(Qcur, "Qcur", il);
|
9484
9580
|
|
9485
|
-
Kcur =
|
9486
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
|
9581
|
+
Kcur = ggml_rope_ext(
|
9582
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
9487
9583
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9488
9584
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9489
9585
|
cb(Kcur, "Kcur", il);
|
@@ -9684,15 +9780,15 @@ struct llm_build_context {
|
|
9684
9780
|
cb(tmpk, "tmpk", il);
|
9685
9781
|
cb(Vcur, "Vcur", il);
|
9686
9782
|
|
9687
|
-
struct ggml_tensor * Qcur =
|
9688
|
-
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
9783
|
+
struct ggml_tensor * Qcur = ggml_rope_ext(
|
9784
|
+
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9689
9785
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9690
9786
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9691
9787
|
);
|
9692
9788
|
cb(Qcur, "Qcur", il);
|
9693
9789
|
|
9694
|
-
struct ggml_tensor * Kcur =
|
9695
|
-
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9790
|
+
struct ggml_tensor * Kcur = ggml_rope_ext(
|
9791
|
+
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9696
9792
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9697
9793
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9698
9794
|
);
|
@@ -9800,15 +9896,15 @@ struct llm_build_context {
|
|
9800
9896
|
// cb(Vcur, "Vcur", il);
|
9801
9897
|
// }
|
9802
9898
|
|
9803
|
-
Qcur =
|
9804
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9899
|
+
Qcur = ggml_rope_ext(
|
9900
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9805
9901
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9806
9902
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9807
9903
|
);
|
9808
9904
|
cb(Qcur, "Qcur", il);
|
9809
9905
|
|
9810
|
-
Kcur =
|
9811
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9906
|
+
Kcur = ggml_rope_ext(
|
9907
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9812
9908
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9813
9909
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9814
9910
|
);
|
@@ -9917,15 +10013,15 @@ struct llm_build_context {
|
|
9917
10013
|
cb(Vcur, "Vcur", il);
|
9918
10014
|
}
|
9919
10015
|
|
9920
|
-
Qcur =
|
9921
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10016
|
+
Qcur = ggml_rope_ext(
|
10017
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9922
10018
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9923
10019
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9924
10020
|
);
|
9925
10021
|
cb(Qcur, "Qcur", il);
|
9926
10022
|
|
9927
|
-
Kcur =
|
9928
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10023
|
+
Kcur = ggml_rope_ext(
|
10024
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9929
10025
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9930
10026
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9931
10027
|
);
|
@@ -10047,15 +10143,15 @@ struct llm_build_context {
|
|
10047
10143
|
cb(Vcur, "Vcur", il);
|
10048
10144
|
}
|
10049
10145
|
|
10050
|
-
Qcur =
|
10051
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10146
|
+
Qcur = ggml_rope_ext(
|
10147
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10052
10148
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10053
10149
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10054
10150
|
);
|
10055
10151
|
cb(Qcur, "Qcur", il);
|
10056
10152
|
|
10057
|
-
Kcur =
|
10058
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10153
|
+
Kcur = ggml_rope_ext(
|
10154
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10059
10155
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10060
10156
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10061
10157
|
);
|
@@ -10167,8 +10263,8 @@ struct llm_build_context {
|
|
10167
10263
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10168
10264
|
cb(Vcur, "Vcur", il);
|
10169
10265
|
|
10170
|
-
Qcur =
|
10171
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
10266
|
+
Qcur = ggml_rope_ext(
|
10267
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
10172
10268
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10173
10269
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10174
10270
|
cb(Qcur, "Qcur", il);
|
@@ -10176,8 +10272,8 @@ struct llm_build_context {
|
|
10176
10272
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
10177
10273
|
cb(Qcur, "Qcur_scaled", il);
|
10178
10274
|
|
10179
|
-
Kcur =
|
10180
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
10275
|
+
Kcur = ggml_rope_ext(
|
10276
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
10181
10277
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10182
10278
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10183
10279
|
cb(Kcur, "Kcur", il);
|
@@ -10287,15 +10383,15 @@ struct llm_build_context {
|
|
10287
10383
|
cb(Vcur, "Vcur", il);
|
10288
10384
|
}
|
10289
10385
|
|
10290
|
-
Qcur =
|
10291
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10386
|
+
Qcur = ggml_rope_ext(
|
10387
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10292
10388
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10293
10389
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10294
10390
|
);
|
10295
10391
|
cb(Qcur, "Qcur", il);
|
10296
10392
|
|
10297
|
-
Kcur =
|
10298
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10393
|
+
Kcur = ggml_rope_ext(
|
10394
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10299
10395
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10300
10396
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10301
10397
|
);
|
@@ -10577,15 +10673,15 @@ struct llm_build_context {
|
|
10577
10673
|
cb(Kcur, "Kcur", il);
|
10578
10674
|
}
|
10579
10675
|
|
10580
|
-
Qcur =
|
10581
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10676
|
+
Qcur = ggml_rope_ext(
|
10677
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10582
10678
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10583
10679
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10584
10680
|
);
|
10585
10681
|
cb(Qcur, "Qcur", il);
|
10586
10682
|
|
10587
|
-
Kcur =
|
10588
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10683
|
+
Kcur = ggml_rope_ext(
|
10684
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10589
10685
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10590
10686
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10591
10687
|
);
|
@@ -10680,8 +10776,269 @@ struct llm_build_context {
|
|
10680
10776
|
|
10681
10777
|
// norm
|
10682
10778
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
10683
|
-
NULL, NULL,
|
10684
|
-
LLM_NORM, cb, il);
|
10779
|
+
NULL, NULL,
|
10780
|
+
LLM_NORM, cb, il);
|
10781
|
+
cb(cur, "attn_norm", il);
|
10782
|
+
|
10783
|
+
// self-attention
|
10784
|
+
{
|
10785
|
+
// compute Q and K and RoPE them
|
10786
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10787
|
+
cb(Qcur, "Qcur", il);
|
10788
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10789
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10790
|
+
cb(Qcur, "Qcur", il);
|
10791
|
+
}
|
10792
|
+
|
10793
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10794
|
+
cb(Kcur, "Kcur", il);
|
10795
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10796
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10797
|
+
cb(Kcur, "Kcur", il);
|
10798
|
+
}
|
10799
|
+
|
10800
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10801
|
+
cb(Vcur, "Vcur", il);
|
10802
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10803
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10804
|
+
cb(Vcur, "Vcur", il);
|
10805
|
+
}
|
10806
|
+
|
10807
|
+
Qcur = ggml_rope_ext(
|
10808
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10809
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10810
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10811
|
+
);
|
10812
|
+
cb(Qcur, "Qcur", il);
|
10813
|
+
|
10814
|
+
Kcur = ggml_rope_ext(
|
10815
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10816
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10817
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10818
|
+
);
|
10819
|
+
cb(Kcur, "Kcur", il);
|
10820
|
+
|
10821
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10822
|
+
model.layers[il].wo, nullptr,
|
10823
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10824
|
+
}
|
10825
|
+
|
10826
|
+
if (il == n_layer - 1) {
|
10827
|
+
// skip computing output for unused tokens
|
10828
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10829
|
+
n_tokens = n_outputs;
|
10830
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10831
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
10832
|
+
}
|
10833
|
+
|
10834
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
10835
|
+
cb(ffn_inp, "ffn_inp", il);
|
10836
|
+
|
10837
|
+
// feed-forward network
|
10838
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10839
|
+
NULL, NULL,
|
10840
|
+
LLM_NORM, cb, il);
|
10841
|
+
cb(cur, "ffn_norm", il);
|
10842
|
+
|
10843
|
+
cur = llm_build_ffn(ctx0, cur,
|
10844
|
+
model.layers[il].ffn_up, NULL,
|
10845
|
+
model.layers[il].ffn_gate, NULL,
|
10846
|
+
model.layers[il].ffn_down, NULL,
|
10847
|
+
NULL,
|
10848
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10849
|
+
cb(cur, "ffn_out", il);
|
10850
|
+
|
10851
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
10852
|
+
cb(cur, "ffn_out", il);
|
10853
|
+
|
10854
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
10855
|
+
if (layer_dir != nullptr) {
|
10856
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
10857
|
+
}
|
10858
|
+
cb(cur, "l_out", il);
|
10859
|
+
|
10860
|
+
// input for next layer
|
10861
|
+
inpL = cur;
|
10862
|
+
}
|
10863
|
+
|
10864
|
+
cur = inpL;
|
10865
|
+
|
10866
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10867
|
+
NULL, NULL,
|
10868
|
+
LLM_NORM, cb, -1);
|
10869
|
+
cb(cur, "result_norm", -1);
|
10870
|
+
|
10871
|
+
// lm_head
|
10872
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10873
|
+
cb(cur, "result_output", -1);
|
10874
|
+
|
10875
|
+
ggml_build_forward_expand(gf, cur);
|
10876
|
+
|
10877
|
+
return gf;
|
10878
|
+
}
|
10879
|
+
|
10880
|
+
struct ggml_cgraph * build_gptneox() {
|
10881
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10882
|
+
|
10883
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10884
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
10885
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10886
|
+
|
10887
|
+
struct ggml_tensor * cur;
|
10888
|
+
struct ggml_tensor * inpL;
|
10889
|
+
|
10890
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10891
|
+
|
10892
|
+
// inp_pos - contains the positions
|
10893
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10894
|
+
|
10895
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10896
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10897
|
+
|
10898
|
+
for (int il = 0; il < n_layer; ++il) {
|
10899
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10900
|
+
model.layers[il].attn_norm,
|
10901
|
+
model.layers[il].attn_norm_b,
|
10902
|
+
LLM_NORM, cb, il);
|
10903
|
+
cb(cur, "attn_norm", il);
|
10904
|
+
|
10905
|
+
// self-attention
|
10906
|
+
{
|
10907
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
10908
|
+
cb(cur, "wqkv", il);
|
10909
|
+
|
10910
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
10911
|
+
cb(cur, "bqkv", il);
|
10912
|
+
|
10913
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
10914
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
10915
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
10916
|
+
|
10917
|
+
cb(Qcur, "Qcur", il);
|
10918
|
+
cb(Kcur, "Kcur", il);
|
10919
|
+
cb(Vcur, "Vcur", il);
|
10920
|
+
|
10921
|
+
Qcur = ggml_rope_ext(
|
10922
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10923
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10924
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10925
|
+
);
|
10926
|
+
cb(Qcur, "Qcur", il);
|
10927
|
+
|
10928
|
+
Kcur = ggml_rope_ext(
|
10929
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10930
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10931
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10932
|
+
);
|
10933
|
+
cb(Kcur, "Kcur", il);
|
10934
|
+
|
10935
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10936
|
+
model.layers[il].wo, model.layers[il].bo,
|
10937
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10938
|
+
}
|
10939
|
+
|
10940
|
+
if (il == n_layer - 1) {
|
10941
|
+
// skip computing output for unused tokens
|
10942
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10943
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10944
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
10945
|
+
}
|
10946
|
+
|
10947
|
+
// ffn
|
10948
|
+
if (hparams.use_par_res) {
|
10949
|
+
// attention and ffn are computed in parallel
|
10950
|
+
// x = x + attn(ln1(x)) + ffn(ln2(x))
|
10951
|
+
|
10952
|
+
struct ggml_tensor * attn_out = cur;
|
10953
|
+
|
10954
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10955
|
+
model.layers[il].ffn_norm,
|
10956
|
+
model.layers[il].ffn_norm_b,
|
10957
|
+
LLM_NORM, cb, il);
|
10958
|
+
cb(cur, "ffn_norm", il);
|
10959
|
+
|
10960
|
+
cur = llm_build_ffn(ctx0, cur,
|
10961
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
10962
|
+
NULL, NULL,
|
10963
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
10964
|
+
NULL,
|
10965
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
10966
|
+
cb(cur, "ffn_out", il);
|
10967
|
+
|
10968
|
+
cur = ggml_add(ctx0, cur, inpL);
|
10969
|
+
cb(cur, "ffn_out", il);
|
10970
|
+
|
10971
|
+
inpL = ggml_add(ctx0, cur, attn_out);
|
10972
|
+
cb(inpL, "l_out", il);
|
10973
|
+
} else {
|
10974
|
+
// attention and ffn are computed sequentially
|
10975
|
+
// x = x + attn(ln1(x))
|
10976
|
+
// x = x + ffn(ln2(x))
|
10977
|
+
|
10978
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
10979
|
+
cb(ffn_inp, "ffn_inp", il);
|
10980
|
+
|
10981
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10982
|
+
model.layers[il].ffn_norm,
|
10983
|
+
model.layers[il].ffn_norm_b,
|
10984
|
+
LLM_NORM, cb, il);
|
10985
|
+
cb(cur, "ffn_norm", il);
|
10986
|
+
|
10987
|
+
cur = llm_build_ffn(ctx0, cur,
|
10988
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
10989
|
+
NULL, NULL,
|
10990
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
10991
|
+
NULL,
|
10992
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
10993
|
+
cb(cur, "ffn_out", il);
|
10994
|
+
|
10995
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
10996
|
+
cb(inpL, "l_out", il);
|
10997
|
+
}
|
10998
|
+
}
|
10999
|
+
|
11000
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11001
|
+
model.output_norm,
|
11002
|
+
model.output_norm_b,
|
11003
|
+
LLM_NORM, cb, -1);
|
11004
|
+
cb(cur, "result_norm", -1);
|
11005
|
+
|
11006
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
11007
|
+
cb(cur, "result_output", -1);
|
11008
|
+
|
11009
|
+
ggml_build_forward_expand(gf, cur);
|
11010
|
+
|
11011
|
+
return gf;
|
11012
|
+
}
|
11013
|
+
|
11014
|
+
struct ggml_cgraph * build_arctic() {
|
11015
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
11016
|
+
|
11017
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
11018
|
+
int32_t n_tokens = this->n_tokens;
|
11019
|
+
|
11020
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
11021
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
11022
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
11023
|
+
|
11024
|
+
struct ggml_tensor * cur;
|
11025
|
+
struct ggml_tensor * inpL;
|
11026
|
+
|
11027
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
11028
|
+
|
11029
|
+
// inp_pos - contains the positions
|
11030
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
11031
|
+
|
11032
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
11033
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
11034
|
+
|
11035
|
+
for (int il = 0; il < n_layer; ++il) {
|
11036
|
+
struct ggml_tensor * inpSA = inpL;
|
11037
|
+
|
11038
|
+
// norm
|
11039
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11040
|
+
model.layers[il].attn_norm, NULL,
|
11041
|
+
LLM_NORM_RMS, cb, il);
|
10685
11042
|
cb(cur, "attn_norm", il);
|
10686
11043
|
|
10687
11044
|
// self-attention
|
@@ -10689,41 +11046,29 @@ struct llm_build_context {
|
|
10689
11046
|
// compute Q and K and RoPE them
|
10690
11047
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10691
11048
|
cb(Qcur, "Qcur", il);
|
10692
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10693
|
-
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10694
|
-
cb(Qcur, "Qcur", il);
|
10695
|
-
}
|
10696
11049
|
|
10697
11050
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10698
11051
|
cb(Kcur, "Kcur", il);
|
10699
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10700
|
-
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10701
|
-
cb(Kcur, "Kcur", il);
|
10702
|
-
}
|
10703
11052
|
|
10704
11053
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10705
11054
|
cb(Vcur, "Vcur", il);
|
10706
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10707
|
-
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10708
|
-
cb(Vcur, "Vcur", il);
|
10709
|
-
}
|
10710
11055
|
|
10711
|
-
Qcur =
|
10712
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
11056
|
+
Qcur = ggml_rope_ext(
|
11057
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10713
11058
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10714
11059
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10715
11060
|
);
|
10716
11061
|
cb(Qcur, "Qcur", il);
|
10717
11062
|
|
10718
|
-
Kcur =
|
10719
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
11063
|
+
Kcur = ggml_rope_ext(
|
11064
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10720
11065
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10721
11066
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10722
11067
|
);
|
10723
11068
|
cb(Kcur, "Kcur", il);
|
10724
11069
|
|
10725
11070
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10726
|
-
model.layers[il].wo,
|
11071
|
+
model.layers[il].wo, NULL,
|
10727
11072
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10728
11073
|
}
|
10729
11074
|
|
@@ -10740,8 +11085,8 @@ struct llm_build_context {
|
|
10740
11085
|
|
10741
11086
|
// feed-forward network
|
10742
11087
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10743
|
-
|
10744
|
-
|
11088
|
+
model.layers[il].ffn_norm, NULL,
|
11089
|
+
LLM_NORM_RMS, cb, il);
|
10745
11090
|
cb(cur, "ffn_norm", il);
|
10746
11091
|
|
10747
11092
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -10752,7 +11097,27 @@ struct llm_build_context {
|
|
10752
11097
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10753
11098
|
cb(cur, "ffn_out", il);
|
10754
11099
|
|
10755
|
-
|
11100
|
+
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
11101
|
+
cb(ffn_out, "ffn_out", il);
|
11102
|
+
|
11103
|
+
// MoE
|
11104
|
+
cur = llm_build_norm(ctx0, inpSA, hparams,
|
11105
|
+
model.layers[il].ffn_norm_exps, NULL,
|
11106
|
+
LLM_NORM_RMS, cb, il);
|
11107
|
+
cb(cur, "ffn_norm_exps", il);
|
11108
|
+
|
11109
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
11110
|
+
model.layers[il].ffn_gate_inp,
|
11111
|
+
model.layers[il].ffn_up_exps,
|
11112
|
+
model.layers[il].ffn_gate_exps,
|
11113
|
+
model.layers[il].ffn_down_exps,
|
11114
|
+
n_expert, n_expert_used,
|
11115
|
+
LLM_FFN_SILU, true,
|
11116
|
+
false, 0.0,
|
11117
|
+
cb, il);
|
11118
|
+
cb(cur, "ffn_moe_out", il);
|
11119
|
+
|
11120
|
+
cur = ggml_add(ctx0, cur, ffn_out);
|
10756
11121
|
cb(cur, "ffn_out", il);
|
10757
11122
|
|
10758
11123
|
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
@@ -10768,8 +11133,240 @@ struct llm_build_context {
|
|
10768
11133
|
cur = inpL;
|
10769
11134
|
|
10770
11135
|
cur = llm_build_norm(ctx0, cur, hparams,
|
10771
|
-
|
10772
|
-
|
11136
|
+
model.output_norm, NULL,
|
11137
|
+
LLM_NORM_RMS, cb, -1);
|
11138
|
+
cb(cur, "result_norm", -1);
|
11139
|
+
|
11140
|
+
// lm_head
|
11141
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
11142
|
+
cb(cur, "result_output", -1);
|
11143
|
+
|
11144
|
+
ggml_build_forward_expand(gf, cur);
|
11145
|
+
|
11146
|
+
return gf;
|
11147
|
+
}
|
11148
|
+
|
11149
|
+
struct ggml_cgraph * build_deepseek2() {
|
11150
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
11151
|
+
|
11152
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
11153
|
+
int32_t n_tokens = this->n_tokens;
|
11154
|
+
|
11155
|
+
bool is_lite = (hparams.n_layer == 27);
|
11156
|
+
|
11157
|
+
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
11158
|
+
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
11159
|
+
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
11160
|
+
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
|
11161
|
+
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
11162
|
+
|
11163
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
11164
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
11165
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
11166
|
+
|
11167
|
+
struct ggml_tensor * cur;
|
11168
|
+
struct ggml_tensor * inpL;
|
11169
|
+
|
11170
|
+
// {n_embd, n_tokens}
|
11171
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
11172
|
+
|
11173
|
+
// inp_pos - contains the positions
|
11174
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
11175
|
+
|
11176
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
11177
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
11178
|
+
|
11179
|
+
for (int il = 0; il < n_layer; ++il) {
|
11180
|
+
struct ggml_tensor * inpSA = inpL;
|
11181
|
+
|
11182
|
+
// norm
|
11183
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11184
|
+
model.layers[il].attn_norm, NULL,
|
11185
|
+
LLM_NORM_RMS, cb, il);
|
11186
|
+
cb(cur, "attn_norm", il);
|
11187
|
+
|
11188
|
+
// self_attention
|
11189
|
+
{
|
11190
|
+
struct ggml_tensor * q = NULL;
|
11191
|
+
if (!is_lite) {
|
11192
|
+
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
|
11193
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
11194
|
+
cb(q, "q", il);
|
11195
|
+
|
11196
|
+
q = llm_build_norm(ctx0, q, hparams,
|
11197
|
+
model.layers[il].attn_q_a_norm, NULL,
|
11198
|
+
LLM_NORM_RMS, cb, il);
|
11199
|
+
cb(q, "q", il);
|
11200
|
+
|
11201
|
+
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
11202
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
11203
|
+
cb(q, "q", il);
|
11204
|
+
} else {
|
11205
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
11206
|
+
cb(q, "q", il);
|
11207
|
+
}
|
11208
|
+
|
11209
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
11210
|
+
struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
11211
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
11212
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
11213
|
+
0);
|
11214
|
+
cb(q_nope, "q_nope", il);
|
11215
|
+
|
11216
|
+
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
11217
|
+
struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
11218
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
11219
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
11220
|
+
ggml_row_size(q->type, n_embd_head_qk_nope));
|
11221
|
+
cb(q_pe, "q_pe", il);
|
11222
|
+
|
11223
|
+
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
|
11224
|
+
struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
11225
|
+
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
11226
|
+
|
11227
|
+
// split into {kv_lora_rank, n_tokens}
|
11228
|
+
struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
|
11229
|
+
kv_pe_compresseed->nb[1],
|
11230
|
+
0);
|
11231
|
+
cb(kv_compressed, "kv_compressed", il);
|
11232
|
+
|
11233
|
+
// and {n_embd_head_qk_rope, n_tokens}
|
11234
|
+
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
11235
|
+
kv_pe_compresseed->nb[1],
|
11236
|
+
kv_pe_compresseed->nb[1],
|
11237
|
+
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
11238
|
+
cb(k_pe, "k_pe", il);
|
11239
|
+
|
11240
|
+
kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
|
11241
|
+
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
|
11242
|
+
model.layers[il].attn_kv_a_norm, NULL,
|
11243
|
+
LLM_NORM_RMS, cb, il);
|
11244
|
+
cb(kv_compressed, "kv_compressed", il);
|
11245
|
+
|
11246
|
+
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
|
11247
|
+
struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
|
11248
|
+
cb(kv, "kv", il);
|
11249
|
+
|
11250
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
11251
|
+
struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
11252
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
11253
|
+
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
11254
|
+
0);
|
11255
|
+
cb(k_nope, "k_nope", il);
|
11256
|
+
|
11257
|
+
// and {n_head * n_embd_head_v, n_tokens}
|
11258
|
+
struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
11259
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
11260
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
11261
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
11262
|
+
cb(v_states, "v_states", il);
|
11263
|
+
|
11264
|
+
v_states = ggml_cont(ctx0, v_states);
|
11265
|
+
cb(v_states, "v_states", il);
|
11266
|
+
|
11267
|
+
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
11268
|
+
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
11269
|
+
0);
|
11270
|
+
cb(v_states, "v_states", il);
|
11271
|
+
|
11272
|
+
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11273
|
+
q_pe = ggml_rope_ext(
|
11274
|
+
ctx0, q_pe, inp_pos, nullptr,
|
11275
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
11276
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11277
|
+
);
|
11278
|
+
cb(q_pe, "q_pe", il);
|
11279
|
+
|
11280
|
+
// shared RoPE key
|
11281
|
+
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11282
|
+
k_pe = ggml_rope_ext(
|
11283
|
+
ctx0, k_pe, inp_pos, nullptr,
|
11284
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
11285
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11286
|
+
);
|
11287
|
+
cb(k_pe, "k_pe", il);
|
11288
|
+
|
11289
|
+
struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
|
11290
|
+
cb(q_states, "q_states", il);
|
11291
|
+
|
11292
|
+
struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
11293
|
+
cb(k_states, "k_states", il);
|
11294
|
+
|
11295
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
11296
|
+
model.layers[il].wo, NULL,
|
11297
|
+
k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
11298
|
+
}
|
11299
|
+
|
11300
|
+
if (il == n_layer - 1) {
|
11301
|
+
// skip computing output for unused tokens
|
11302
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
11303
|
+
n_tokens = n_outputs;
|
11304
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
11305
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
11306
|
+
}
|
11307
|
+
|
11308
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
11309
|
+
cb(ffn_inp, "ffn_inp", il);
|
11310
|
+
|
11311
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
11312
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
11313
|
+
model.layers[il].ffn_norm, NULL,
|
11314
|
+
LLM_NORM_RMS, cb, il);
|
11315
|
+
cb(cur, "ffn_norm", il);
|
11316
|
+
|
11317
|
+
cur = llm_build_ffn(ctx0, cur,
|
11318
|
+
model.layers[il].ffn_up, NULL,
|
11319
|
+
model.layers[il].ffn_gate, NULL,
|
11320
|
+
model.layers[il].ffn_down, NULL,
|
11321
|
+
NULL,
|
11322
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
11323
|
+
cb(cur, "ffn_out", il);
|
11324
|
+
} else {
|
11325
|
+
// MoE branch
|
11326
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
11327
|
+
model.layers[il].ffn_norm, NULL,
|
11328
|
+
LLM_NORM_RMS, cb, il);
|
11329
|
+
cb(cur, "ffn_norm", il);
|
11330
|
+
|
11331
|
+
ggml_tensor * moe_out =
|
11332
|
+
llm_build_moe_ffn(ctx0, cur,
|
11333
|
+
model.layers[il].ffn_gate_inp,
|
11334
|
+
model.layers[il].ffn_up_exps,
|
11335
|
+
model.layers[il].ffn_gate_exps,
|
11336
|
+
model.layers[il].ffn_down_exps,
|
11337
|
+
n_expert, n_expert_used,
|
11338
|
+
LLM_FFN_SILU, false,
|
11339
|
+
true, hparams.expert_weights_scale,
|
11340
|
+
cb, il);
|
11341
|
+
cb(moe_out, "ffn_moe_out", il);
|
11342
|
+
|
11343
|
+
// FFN shared expert
|
11344
|
+
{
|
11345
|
+
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
|
11346
|
+
model.layers[il].ffn_up_shexp, NULL,
|
11347
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
11348
|
+
model.layers[il].ffn_down_shexp, NULL,
|
11349
|
+
NULL,
|
11350
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
11351
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
11352
|
+
|
11353
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
11354
|
+
cb(cur, "ffn_out", il);
|
11355
|
+
}
|
11356
|
+
}
|
11357
|
+
|
11358
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
11359
|
+
cb(cur, "l_out", il);
|
11360
|
+
|
11361
|
+
// input for next layer
|
11362
|
+
inpL = cur;
|
11363
|
+
}
|
11364
|
+
|
11365
|
+
cur = inpL;
|
11366
|
+
|
11367
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
11368
|
+
model.output_norm, NULL,
|
11369
|
+
LLM_NORM_RMS, cb, -1);
|
10773
11370
|
cb(cur, "result_norm", -1);
|
10774
11371
|
|
10775
11372
|
// lm_head
|
@@ -10780,6 +11377,7 @@ struct llm_build_context {
|
|
10780
11377
|
|
10781
11378
|
return gf;
|
10782
11379
|
}
|
11380
|
+
|
10783
11381
|
};
|
10784
11382
|
|
10785
11383
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -10896,10 +11494,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10896
11494
|
{
|
10897
11495
|
result = llm.build_starcoder();
|
10898
11496
|
} break;
|
10899
|
-
case LLM_ARCH_PERSIMMON:
|
10900
|
-
{
|
10901
|
-
result = llm.build_persimmon();
|
10902
|
-
} break;
|
10903
11497
|
case LLM_ARCH_REFACT:
|
10904
11498
|
{
|
10905
11499
|
result = llm.build_refact();
|
@@ -10994,6 +11588,18 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10994
11588
|
{
|
10995
11589
|
result = llm.build_olmo();
|
10996
11590
|
} break;
|
11591
|
+
case LLM_ARCH_GPTNEOX:
|
11592
|
+
{
|
11593
|
+
result = llm.build_gptneox();
|
11594
|
+
} break;
|
11595
|
+
case LLM_ARCH_ARCTIC:
|
11596
|
+
{
|
11597
|
+
result = llm.build_arctic();
|
11598
|
+
} break;
|
11599
|
+
case LLM_ARCH_DEEPSEEK2:
|
11600
|
+
{
|
11601
|
+
result = llm.build_deepseek2();
|
11602
|
+
} break;
|
10997
11603
|
default:
|
10998
11604
|
GGML_ASSERT(false);
|
10999
11605
|
}
|
@@ -11339,11 +11945,6 @@ static void llama_graph_compute(
|
|
11339
11945
|
llama_context & lctx,
|
11340
11946
|
ggml_cgraph * gf,
|
11341
11947
|
int n_threads) {
|
11342
|
-
#ifdef GGML_USE_MPI
|
11343
|
-
const int64_t n_layer = lctx.model.hparams.n_layer;
|
11344
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
11345
|
-
#endif
|
11346
|
-
|
11347
11948
|
#ifdef GGML_USE_METAL
|
11348
11949
|
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
11349
11950
|
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
@@ -11358,10 +11959,6 @@ static void llama_graph_compute(
|
|
11358
11959
|
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
11359
11960
|
|
11360
11961
|
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
11361
|
-
|
11362
|
-
#ifdef GGML_USE_MPI
|
11363
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
11364
|
-
#endif
|
11365
11962
|
}
|
11366
11963
|
|
11367
11964
|
// decode a batch of tokens by evaluating the transformer
|
@@ -11399,12 +11996,6 @@ static int llama_decode_internal(
|
|
11399
11996
|
}
|
11400
11997
|
lctx.n_queued_tokens += n_tokens_all;
|
11401
11998
|
|
11402
|
-
#ifdef GGML_USE_MPI
|
11403
|
-
// TODO: needs fix after #3228
|
11404
|
-
GGML_ASSERT(false && "not implemented");
|
11405
|
-
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
11406
|
-
#endif
|
11407
|
-
|
11408
11999
|
auto & kv_self = lctx.kv_self;
|
11409
12000
|
|
11410
12001
|
const int64_t n_embd = hparams.n_embd;
|
@@ -12298,6 +12889,7 @@ struct llm_tokenizer_bpe {
|
|
12298
12889
|
});
|
12299
12890
|
break;
|
12300
12891
|
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
12892
|
+
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
12301
12893
|
word_collection = unicode_regex_split(text, {
|
12302
12894
|
// same as llama3
|
12303
12895
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
@@ -12354,6 +12946,7 @@ struct llm_tokenizer_bpe {
|
|
12354
12946
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12355
12947
|
});
|
12356
12948
|
break;
|
12949
|
+
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
|
12357
12950
|
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
12358
12951
|
word_collection = unicode_regex_split(text, {
|
12359
12952
|
// original regex from tokenizer.json
|
@@ -12519,7 +13112,7 @@ struct llm_tokenizer_wpm {
|
|
12519
13112
|
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
12520
13113
|
|
12521
13114
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
12522
|
-
auto
|
13115
|
+
const auto & token_map = vocab.token_to_id;
|
12523
13116
|
|
12524
13117
|
// normalize and split by whitespace
|
12525
13118
|
std::vector<std::string> words = preprocess(text);
|
@@ -12534,108 +13127,89 @@ struct llm_tokenizer_wpm {
|
|
12534
13127
|
}
|
12535
13128
|
|
12536
13129
|
// prepend phantom space
|
12537
|
-
std::string word1 = "\xe2\x96\x81" + word;
|
12538
|
-
int n = word1.size();
|
13130
|
+
const std::string word1 = "\xe2\x96\x81" + word;
|
13131
|
+
const int n = word1.size();
|
12539
13132
|
|
12540
|
-
|
12541
|
-
int i = 0;
|
12542
|
-
bool match_any = false;
|
13133
|
+
const size_t current_tokens = output.size();
|
12543
13134
|
|
13135
|
+
// we're at the start of a new word
|
12544
13136
|
// move through character position in word
|
12545
|
-
|
13137
|
+
for (int i = 0; i < n; ++i) {
|
12546
13138
|
// loop through possible match length
|
12547
13139
|
bool match = false;
|
12548
13140
|
for (int j = n; j > i; j--) {
|
12549
|
-
auto it = token_map
|
12550
|
-
if (it != token_map
|
13141
|
+
auto it = token_map.find(word1.substr(i, j - i));
|
13142
|
+
if (it != token_map.end()) {
|
12551
13143
|
output.push_back(it->second);
|
12552
13144
|
match = true;
|
12553
|
-
|
12554
|
-
i = j;
|
13145
|
+
i = j - 1;
|
12555
13146
|
break;
|
12556
13147
|
}
|
12557
13148
|
}
|
12558
13149
|
|
12559
|
-
|
12560
|
-
|
12561
|
-
|
13150
|
+
if (!match) { // discard all
|
13151
|
+
output.resize(current_tokens);
|
13152
|
+
break; // and discard next tokens
|
12562
13153
|
}
|
12563
13154
|
}
|
12564
13155
|
|
12565
13156
|
// we didn't find any matches for this word
|
12566
|
-
if (
|
13157
|
+
if (current_tokens == output.size()) {
|
12567
13158
|
output.push_back(vocab.special_unk_id);
|
12568
13159
|
}
|
12569
13160
|
}
|
12570
13161
|
}
|
12571
13162
|
|
12572
13163
|
std::vector<std::string> preprocess(const std::string & text) {
|
12573
|
-
std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
12574
|
-
|
12575
|
-
|
12576
|
-
|
12577
|
-
|
12578
|
-
|
12579
|
-
|
12580
|
-
|
13164
|
+
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
13165
|
+
std::vector<std::string> words(1, "");
|
13166
|
+
|
13167
|
+
for (const char32_t cpt : cpts_nfd) {
|
13168
|
+
const auto flags = unicode_cpt_flags(cpt);
|
13169
|
+
|
13170
|
+
if (flags.is_whitespace) {
|
13171
|
+
if (words.back().size()) { // finish previous word if any
|
13172
|
+
words.emplace_back();
|
13173
|
+
}
|
12581
13174
|
continue;
|
12582
13175
|
}
|
12583
|
-
|
12584
|
-
|
12585
|
-
|
12586
|
-
|
12587
|
-
std::string s = unicode_cpt_to_utf8(code);
|
12588
|
-
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
12589
|
-
new_str += " ";
|
12590
|
-
new_str += s;
|
12591
|
-
new_str += " ";
|
12592
|
-
} else {
|
12593
|
-
new_str += s;
|
13176
|
+
|
13177
|
+
assert (!flags.is_separator);
|
13178
|
+
if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
|
13179
|
+
continue;
|
12594
13180
|
}
|
12595
|
-
}
|
12596
13181
|
|
12597
|
-
|
12598
|
-
|
12599
|
-
|
12600
|
-
|
12601
|
-
|
12602
|
-
|
12603
|
-
|
12604
|
-
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
12605
|
-
l = r + 1;
|
12606
|
-
r = l;
|
13182
|
+
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
13183
|
+
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
|
13184
|
+
if (words.back().size()) { // finish previous word if any
|
13185
|
+
words.emplace_back();
|
13186
|
+
}
|
13187
|
+
words.back() = s; // single char word
|
13188
|
+
words.emplace_back(); // start a new word
|
12607
13189
|
} else {
|
12608
|
-
|
13190
|
+
words.back() += s; // append char to word
|
12609
13191
|
}
|
12610
13192
|
}
|
12611
|
-
if (r > l) {
|
12612
|
-
words.push_back(new_str.substr(l, (r - l)));
|
12613
|
-
}
|
12614
|
-
return words;
|
12615
|
-
}
|
12616
13193
|
|
12617
|
-
|
12618
|
-
|
12619
|
-
return false;
|
13194
|
+
if (!words.back().size()) {
|
13195
|
+
words.pop_back();
|
12620
13196
|
}
|
12621
|
-
|
12622
|
-
return
|
13197
|
+
|
13198
|
+
return words;
|
12623
13199
|
}
|
12624
13200
|
|
12625
|
-
bool is_chinese_char(uint32_t cpt) {
|
12626
|
-
|
12627
|
-
(cpt >=
|
13201
|
+
static bool is_chinese_char(uint32_t cpt) {
|
13202
|
+
return
|
13203
|
+
(cpt >= 0x04E00 && cpt <= 0x09FFF) ||
|
13204
|
+
(cpt >= 0x03400 && cpt <= 0x04DBF) ||
|
12628
13205
|
(cpt >= 0x20000 && cpt <= 0x2A6DF) ||
|
12629
13206
|
(cpt >= 0x2A700 && cpt <= 0x2B73F) ||
|
12630
13207
|
(cpt >= 0x2B740 && cpt <= 0x2B81F) ||
|
12631
13208
|
(cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
|
12632
|
-
(cpt >=
|
12633
|
-
(cpt >= 0x2F800 && cpt <= 0x2FA1F)
|
12634
|
-
(cpt >= 0x3000 && cpt <= 0x303F) ||
|
12635
|
-
(cpt >= 0xFF00 && cpt <= 0xFFEF)
|
12636
|
-
return true; // NOLINT
|
12637
|
-
}
|
12638
|
-
return false;
|
13209
|
+
(cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
|
13210
|
+
(cpt >= 0x2F800 && cpt <= 0x2FA1F);
|
13211
|
+
//(cpt >= 0x3000 && cpt <= 0x303F) ||
|
13212
|
+
//(cpt >= 0xFF00 && cpt <= 0xFFEF);
|
12639
13213
|
}
|
12640
13214
|
|
12641
13215
|
const llama_vocab & vocab;
|
@@ -12679,9 +13253,8 @@ struct fragment_buffer_variant {
|
|
12679
13253
|
|
12680
13254
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
12681
13255
|
// for each special token
|
12682
|
-
for (const
|
12683
|
-
const auto & special_token =
|
12684
|
-
const auto & special_id = st.second;
|
13256
|
+
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
13257
|
+
const auto & special_token = vocab.id_to_token[special_id].text;
|
12685
13258
|
|
12686
13259
|
// for each text fragment
|
12687
13260
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
@@ -12690,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12690
13263
|
|
12691
13264
|
// if a fragment is text ( not yet processed )
|
12692
13265
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
12693
|
-
auto
|
13266
|
+
auto & raw_text = fragment.raw_text;
|
12694
13267
|
|
12695
13268
|
auto raw_text_base_offset = fragment.offset;
|
12696
13269
|
auto raw_text_base_length = fragment.length;
|
@@ -12700,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12700
13273
|
// find the first occurrence of a given special token in this fragment
|
12701
13274
|
// passing offset argument only limit the "search area" but match coordinates
|
12702
13275
|
// are still relative to the source full raw_text
|
12703
|
-
auto match = raw_text
|
13276
|
+
auto match = raw_text.find(special_token, raw_text_base_offset);
|
12704
13277
|
|
12705
13278
|
// no occurrences found, stop processing this fragment for a given special token
|
12706
13279
|
if (match == std::string::npos) break;
|
@@ -12719,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12719
13292
|
// left
|
12720
13293
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
12721
13294
|
const int64_t left_reminder_length = match - raw_text_base_offset;
|
12722
|
-
buffer.emplace_after(it,
|
13295
|
+
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
12723
13296
|
|
12724
13297
|
#ifdef PRETOKENIZERDEBUG
|
12725
13298
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
@@ -12735,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12735
13308
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
12736
13309
|
const int64_t right_reminder_offset = match + special_token.length();
|
12737
13310
|
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
12738
|
-
buffer.emplace_after(it,
|
13311
|
+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
12739
13312
|
|
12740
13313
|
#ifdef PRETOKENIZERDEBUG
|
12741
13314
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
@@ -12788,9 +13361,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12788
13361
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
12789
13362
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
12790
13363
|
|
13364
|
+
static const bool rtrim = true; //TODO: as param
|
13365
|
+
bool is_prev_special = false;
|
13366
|
+
bool special_token_rtrim = false;
|
13367
|
+
|
12791
13368
|
if (add_special && vocab.special_add_bos != 0) {
|
12792
13369
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
12793
13370
|
output.push_back(vocab.special_bos_id);
|
13371
|
+
is_prev_special = true;
|
12794
13372
|
}
|
12795
13373
|
|
12796
13374
|
for (const auto & fragment : fragment_buffer) {
|
@@ -12802,9 +13380,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12802
13380
|
// and passing 'add space prefix' as bool argument
|
12803
13381
|
//
|
12804
13382
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
12805
|
-
|
12806
|
-
|
12807
|
-
|
13383
|
+
|
13384
|
+
if (special_token_rtrim) {
|
13385
|
+
size_t num_whitespaces = 0;
|
13386
|
+
while (isspace(raw_text[num_whitespaces])) {
|
13387
|
+
num_whitespaces++;
|
13388
|
+
}
|
13389
|
+
if (num_whitespaces == raw_text.size()) {
|
13390
|
+
continue; // skip if all whitespaces
|
13391
|
+
}
|
13392
|
+
raw_text = raw_text.substr(num_whitespaces);
|
13393
|
+
}
|
13394
|
+
|
13395
|
+
if (vocab.add_space_prefix) {
|
13396
|
+
if (!output.size() || is_prev_special) { // prefix with space if first token
|
13397
|
+
raw_text = " " + raw_text;
|
12808
13398
|
}
|
12809
13399
|
}
|
12810
13400
|
|
@@ -12816,6 +13406,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12816
13406
|
tokenizer.tokenize(raw_text, output);
|
12817
13407
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
12818
13408
|
output.push_back(fragment.token);
|
13409
|
+
is_prev_special = true;
|
13410
|
+
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
13411
|
+
special_token_rtrim = rtrim
|
13412
|
+
&& fragment.token != vocab.special_bos_id
|
13413
|
+
&& fragment.token != vocab.special_unk_id
|
13414
|
+
&& fragment.token != vocab.special_eos_id;
|
12819
13415
|
}
|
12820
13416
|
}
|
12821
13417
|
|
@@ -13816,7 +14412,7 @@ void llama_sample_repetition_penalties(
|
|
13816
14412
|
|
13817
14413
|
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
13818
14414
|
GGML_ASSERT(ctx);
|
13819
|
-
|
14415
|
+
int64_t t_start_sample_us = ggml_time_us();
|
13820
14416
|
|
13821
14417
|
bool allow_eog = false;
|
13822
14418
|
for (const auto & stack : grammar->stacks) {
|
@@ -13828,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
13828
14424
|
|
13829
14425
|
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
13830
14426
|
candidates_decoded.reserve(candidates->size);
|
13831
|
-
|
14427
|
+
|
14428
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
13832
14429
|
candidates_grammar.reserve(candidates->size);
|
13833
14430
|
|
13834
14431
|
for (size_t i = 0; i < candidates->size; ++i) {
|
13835
|
-
const llama_token id
|
13836
|
-
const std::string piece =
|
14432
|
+
const llama_token id = candidates->data[i].id;
|
14433
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
|
13837
14434
|
|
13838
14435
|
if (llama_token_is_eog(&ctx->model, id)) {
|
13839
14436
|
if (!allow_eog) {
|
@@ -14033,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
14033
14630
|
GGML_ASSERT(false);
|
14034
14631
|
}
|
14035
14632
|
|
14036
|
-
const std::string piece =
|
14633
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
|
14037
14634
|
|
14038
14635
|
// Note terminating 0 in decoded string
|
14039
14636
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
@@ -14518,8 +15115,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
14518
15115
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
14519
15116
|
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
14520
15117
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
14521
|
-
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
14522
|
-
(qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
14523
15118
|
if (qs.model.type == MODEL_70B) {
|
14524
15119
|
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
14525
15120
|
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
@@ -15533,10 +16128,6 @@ void llama_backend_init(void) {
|
|
15533
16128
|
struct ggml_context * ctx = ggml_init(params);
|
15534
16129
|
ggml_free(ctx);
|
15535
16130
|
}
|
15536
|
-
|
15537
|
-
#ifdef GGML_USE_MPI
|
15538
|
-
ggml_mpi_backend_init();
|
15539
|
-
#endif
|
15540
16131
|
}
|
15541
16132
|
|
15542
16133
|
void llama_numa_init(enum ggml_numa_strategy numa) {
|
@@ -15546,9 +16137,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
|
|
15546
16137
|
}
|
15547
16138
|
|
15548
16139
|
void llama_backend_free(void) {
|
15549
|
-
#ifdef GGML_USE_MPI
|
15550
|
-
ggml_mpi_backend_free();
|
15551
|
-
#endif
|
15552
16140
|
ggml_quantize_free();
|
15553
16141
|
}
|
15554
16142
|
|
@@ -15691,6 +16279,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15691
16279
|
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
15692
16280
|
}
|
15693
16281
|
|
16282
|
+
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
|
15694
16283
|
cparams.causal_attn = hparams.causal_attn;
|
15695
16284
|
|
15696
16285
|
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
@@ -15949,20 +16538,6 @@ struct llama_context * llama_new_context_with_model(
|
|
15949
16538
|
}
|
15950
16539
|
}
|
15951
16540
|
|
15952
|
-
#ifdef GGML_USE_MPI
|
15953
|
-
ctx->ctx_mpi = ggml_mpi_init();
|
15954
|
-
|
15955
|
-
if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
|
15956
|
-
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
|
15957
|
-
// TODO: needs fix after #3228
|
15958
|
-
GGML_ASSERT(false && "not implemented");
|
15959
|
-
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
|
15960
|
-
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
|
15961
|
-
llama_backend_free();
|
15962
|
-
exit(1);
|
15963
|
-
}
|
15964
|
-
#endif
|
15965
|
-
|
15966
16541
|
return ctx;
|
15967
16542
|
}
|
15968
16543
|
|
@@ -15999,7 +16574,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15999
16574
|
// these models do not use RoPE
|
16000
16575
|
case LLM_ARCH_GPT2:
|
16001
16576
|
case LLM_ARCH_GPTJ:
|
16002
|
-
case LLM_ARCH_GPTNEOX:
|
16003
16577
|
case LLM_ARCH_MPT:
|
16004
16578
|
case LLM_ARCH_REFACT:
|
16005
16579
|
case LLM_ARCH_BLOOM:
|
@@ -16019,13 +16593,14 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
16019
16593
|
case LLM_ARCH_XVERSE:
|
16020
16594
|
case LLM_ARCH_COMMAND_R:
|
16021
16595
|
case LLM_ARCH_OLMO:
|
16596
|
+
case LLM_ARCH_ARCTIC:
|
16597
|
+
case LLM_ARCH_DEEPSEEK2:
|
16022
16598
|
return LLAMA_ROPE_TYPE_NORM;
|
16023
16599
|
|
16024
16600
|
// the pairs of head values are offset by n_rot/2
|
16025
16601
|
case LLM_ARCH_FALCON:
|
16026
16602
|
case LLM_ARCH_GROK:
|
16027
16603
|
case LLM_ARCH_DBRX:
|
16028
|
-
case LLM_ARCH_PERSIMMON:
|
16029
16604
|
case LLM_ARCH_BERT:
|
16030
16605
|
case LLM_ARCH_NOMIC_BERT:
|
16031
16606
|
case LLM_ARCH_STABLELM:
|
@@ -16036,6 +16611,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
16036
16611
|
case LLM_ARCH_PHI3:
|
16037
16612
|
case LLM_ARCH_GEMMA:
|
16038
16613
|
case LLM_ARCH_STARCODER2:
|
16614
|
+
case LLM_ARCH_GPTNEOX:
|
16039
16615
|
return LLAMA_ROPE_TYPE_NEOX;
|
16040
16616
|
|
16041
16617
|
// all model arches should be listed explicitly here
|
@@ -16195,6 +16771,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
|
16195
16771
|
}
|
16196
16772
|
|
16197
16773
|
// make tensors
|
16774
|
+
cvec.tensors.reserve(model.hparams.n_layer);
|
16198
16775
|
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
16199
16776
|
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
16200
16777
|
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
|
@@ -16203,6 +16780,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
|
16203
16780
|
}
|
16204
16781
|
|
16205
16782
|
// allocate tensors / buffers and zero
|
16783
|
+
cvec.ctxs.reserve(ctx_map.size());
|
16784
|
+
cvec.bufs.reserve(ctx_map.size());
|
16206
16785
|
for (auto it : ctx_map) {
|
16207
16786
|
ggml_backend_buffer_type_t buft = it.first;
|
16208
16787
|
ggml_context * ctx = it.second;
|
@@ -17411,6 +17990,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
|
|
17411
17990
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
17412
17991
|
}
|
17413
17992
|
|
17993
|
+
uint32_t llama_n_threads(struct llama_context * ctx) {
|
17994
|
+
return ctx->cparams.n_threads;
|
17995
|
+
}
|
17996
|
+
|
17997
|
+
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
|
17998
|
+
return ctx->cparams.n_threads_batch;
|
17999
|
+
}
|
18000
|
+
|
17414
18001
|
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
17415
18002
|
ctx->abort_callback = abort_callback;
|
17416
18003
|
ctx->abort_callback_data = abort_callback_data;
|
@@ -17634,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
|
17634
18221
|
);
|
17635
18222
|
}
|
17636
18223
|
|
18224
|
+
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
|
18225
|
+
return llama_is_control_token(model->vocab, token);
|
18226
|
+
}
|
18227
|
+
|
17637
18228
|
llama_token llama_token_bos(const struct llama_model * model) {
|
17638
18229
|
return model->vocab.special_bos_id;
|
17639
18230
|
}
|
@@ -17705,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {
|
|
17705
18296
|
|
17706
18297
|
const auto cpts = unicode_cpts_from_utf8(text);
|
17707
18298
|
for (const auto cpt : cpts) {
|
17708
|
-
|
18299
|
+
const auto utf8 = unicode_cpt_to_utf8(cpt);
|
18300
|
+
try {
|
18301
|
+
decoded_text += unicode_utf8_to_byte(utf8);
|
18302
|
+
} catch (const std::out_of_range & e) {
|
18303
|
+
decoded_text += "[UNK_BYTE_0x";
|
18304
|
+
for (const auto c : utf8) {
|
18305
|
+
decoded_text += format("%02x", (uint8_t) c);
|
18306
|
+
}
|
18307
|
+
decoded_text += text + "]";
|
18308
|
+
}
|
17709
18309
|
}
|
17710
18310
|
|
17711
18311
|
return decoded_text;
|
@@ -17713,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {
|
|
17713
18313
|
|
17714
18314
|
// does not write null-terminator to buf
|
17715
18315
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
18316
|
+
// if we have a cache - use it
|
18317
|
+
{
|
18318
|
+
const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
|
18319
|
+
|
18320
|
+
if (!cache.empty()) {
|
18321
|
+
const auto & res = cache.at(token);
|
18322
|
+
if (length < (int) res.size()) {
|
18323
|
+
return -(int) res.size();
|
18324
|
+
}
|
18325
|
+
memcpy(buf, res.c_str(), res.size());
|
18326
|
+
return res.size();
|
18327
|
+
}
|
18328
|
+
}
|
18329
|
+
|
17716
18330
|
if (0 <= token && token < llama_n_vocab(model)) {
|
17717
18331
|
switch (llama_vocab_get_type(model->vocab)) {
|
17718
|
-
|
17719
|
-
|
17720
|
-
|
17721
|
-
|
17722
|
-
|
17723
|
-
|
17724
|
-
|
17725
|
-
|
17726
|
-
|
17727
|
-
|
17728
|
-
|
17729
|
-
|
17730
|
-
|
17731
|
-
|
17732
|
-
|
17733
|
-
|
17734
|
-
|
17735
|
-
|
17736
|
-
|
17737
|
-
|
17738
|
-
|
17739
|
-
|
17740
|
-
|
17741
|
-
|
17742
|
-
|
17743
|
-
|
17744
|
-
|
17745
|
-
|
17746
|
-
|
17747
|
-
|
18332
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
18333
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
18334
|
+
// NOTE: we accept all unsupported token types,
|
18335
|
+
// suppressing them like CONTROL tokens.
|
18336
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
18337
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18338
|
+
llama_unescape_whitespace(result);
|
18339
|
+
if (length < (int) result.length()) {
|
18340
|
+
return -(int) result.length();
|
18341
|
+
}
|
18342
|
+
memcpy(buf, result.c_str(), result.length());
|
18343
|
+
return result.length();
|
18344
|
+
} else if (
|
18345
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
18346
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
18347
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18348
|
+
if (length < (int) result.length()) {
|
18349
|
+
return -(int) result.length();
|
18350
|
+
}
|
18351
|
+
memcpy(buf, result.c_str(), result.length());
|
18352
|
+
return result.length();
|
18353
|
+
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
|
18354
|
+
if (length < 3) {
|
18355
|
+
return -3;
|
18356
|
+
}
|
18357
|
+
memcpy(buf, "\xe2\x96\x85", 3);
|
18358
|
+
return 3;
|
18359
|
+
} else if (llama_is_byte_token(model->vocab, token)) {
|
18360
|
+
if (length < 1) {
|
18361
|
+
return -1;
|
18362
|
+
}
|
18363
|
+
buf[0] = llama_token_to_byte(model->vocab, token);
|
18364
|
+
return 1;
|
17748
18365
|
}
|
17749
|
-
|
17750
|
-
return 1;
|
18366
|
+
break;
|
17751
18367
|
}
|
17752
|
-
|
17753
|
-
|
17754
|
-
|
17755
|
-
|
17756
|
-
|
17757
|
-
|
17758
|
-
|
17759
|
-
|
17760
|
-
|
17761
|
-
|
17762
|
-
|
17763
|
-
|
17764
|
-
|
17765
|
-
|
17766
|
-
|
17767
|
-
(
|
17768
|
-
|
17769
|
-
|
17770
|
-
|
18368
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
18369
|
+
// NOTE: we accept all unsupported token types,
|
18370
|
+
// suppressing them like CONTROL tokens.
|
18371
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
18372
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18373
|
+
result = llama_decode_text(result);
|
18374
|
+
if (length < (int) result.length()) {
|
18375
|
+
return -(int) result.length();
|
18376
|
+
}
|
18377
|
+
memcpy(buf, result.c_str(), result.length());
|
18378
|
+
return result.length();
|
18379
|
+
} else if (
|
18380
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
18381
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
18382
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18383
|
+
if (length < (int) result.length()) {
|
18384
|
+
return -(int) result.length();
|
18385
|
+
}
|
18386
|
+
memcpy(buf, result.c_str(), result.length());
|
18387
|
+
return result.length();
|
17771
18388
|
}
|
17772
|
-
|
17773
|
-
return result.length();
|
18389
|
+
break;
|
17774
18390
|
}
|
17775
|
-
|
17776
|
-
|
17777
|
-
default:
|
17778
|
-
GGML_ASSERT(false);
|
18391
|
+
default:
|
18392
|
+
GGML_ASSERT(false);
|
17779
18393
|
}
|
17780
18394
|
}
|
17781
18395
|
return 0;
|
@@ -17845,6 +18459,15 @@ static int32_t llama_chat_apply_template_internal(
|
|
17845
18459
|
}
|
17846
18460
|
}
|
17847
18461
|
// llama2 templates seem to not care about "add_generation_prompt"
|
18462
|
+
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
|
18463
|
+
// Phi 3
|
18464
|
+
for (auto message : chat) {
|
18465
|
+
std::string role(message->role);
|
18466
|
+
ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
|
18467
|
+
}
|
18468
|
+
if (add_ass) {
|
18469
|
+
ss << "<|assistant|>\n";
|
18470
|
+
}
|
17848
18471
|
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
|
17849
18472
|
// zephyr template
|
17850
18473
|
for (auto message : chat) {
|
@@ -17977,15 +18600,6 @@ static int32_t llama_chat_apply_template_internal(
|
|
17977
18600
|
if (add_ass) {
|
17978
18601
|
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
17979
18602
|
}
|
17980
|
-
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
|
17981
|
-
// Phi 3
|
17982
|
-
for (auto message : chat) {
|
17983
|
-
std::string role(message->role);
|
17984
|
-
ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
|
17985
|
-
}
|
17986
|
-
if (add_ass) {
|
17987
|
-
ss << "<|assistant|>\n";
|
17988
|
-
}
|
17989
18603
|
} else {
|
17990
18604
|
// template not supported
|
17991
18605
|
return -1;
|
@@ -18107,8 +18721,10 @@ const char * llama_print_system_info(void) {
|
|
18107
18721
|
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
18108
18722
|
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
18109
18723
|
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
18724
|
+
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
18110
18725
|
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
18111
18726
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
18727
|
+
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
18112
18728
|
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
18113
18729
|
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
18114
18730
|
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
@@ -18167,6 +18783,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
|
18167
18783
|
g_state.log_callback_user_data = user_data;
|
18168
18784
|
#ifdef GGML_USE_METAL
|
18169
18785
|
ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
18786
|
+
#elif defined(GGML_USE_CUDA)
|
18787
|
+
ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
18170
18788
|
#endif
|
18171
18789
|
}
|
18172
18790
|
|