llama_cpp 0.15.2 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
@@ -26,16 +26,9 @@
 #ifdef GGML_USE_METAL
 # include "ggml-metal.h"
 #endif
-
-
-#
-#ifndef QK_K
-# ifdef GGML_QKK_64
-# define QK_K 64
-# else
-# define QK_K 256
-# endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256

 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
 #endif

 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 160

 //
 // logging
@@ -205,7 +198,6 @@ enum llm_arch {
 LLM_ARCH_GPTNEOX,
 LLM_ARCH_MPT,
 LLM_ARCH_STARCODER,
-LLM_ARCH_PERSIMMON,
 LLM_ARCH_REFACT,
 LLM_ARCH_BERT,
 LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,8 @@ enum llm_arch {
 LLM_ARCH_COMMAND_R,
 LLM_ARCH_DBRX,
 LLM_ARCH_OLMO,
+LLM_ARCH_ARCTIC,
+LLM_ARCH_DEEPSEEK2,
 LLM_ARCH_UNKNOWN,
 };

@@ -242,7 +236,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_MPT, "mpt" },
 { LLM_ARCH_BAICHUAN, "baichuan" },
 { LLM_ARCH_STARCODER, "starcoder" },
-{ LLM_ARCH_PERSIMMON, "persimmon" },
 { LLM_ARCH_REFACT, "refact" },
 { LLM_ARCH_BERT, "bert" },
 { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -266,6 +259,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_COMMAND_R, "command-r" },
 { LLM_ARCH_DBRX, "dbrx" },
 { LLM_ARCH_OLMO, "olmo" },
+{ LLM_ARCH_ARCTIC, "arctic" },
+{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -286,11 +281,15 @@ enum llm_kv {
 LLM_KV_CONTEXT_LENGTH,
 LLM_KV_EMBEDDING_LENGTH,
 LLM_KV_BLOCK_COUNT,
+LLM_KV_LEADING_DENSE_BLOCK_COUNT,
 LLM_KV_FEED_FORWARD_LENGTH,
+LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
 LLM_KV_USE_PARALLEL_RESIDUAL,
 LLM_KV_TENSOR_DATA_LAYOUT,
 LLM_KV_EXPERT_COUNT,
 LLM_KV_EXPERT_USED_COUNT,
+LLM_KV_EXPERT_SHARED_COUNT,
+LLM_KV_EXPERT_WEIGHTS_SCALE,
 LLM_KV_POOLING_TYPE,
 LLM_KV_LOGIT_SCALE,

@@ -303,14 +302,18 @@ enum llm_kv {
 LLM_KV_ATTENTION_LAYERNORM_EPS,
 LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 LLM_KV_ATTENTION_CAUSAL,
+LLM_KV_ATTENTION_Q_LORA_RANK,
+LLM_KV_ATTENTION_KV_LORA_RANK,

 LLM_KV_ROPE_DIMENSION_COUNT,
 LLM_KV_ROPE_FREQ_BASE,
 LLM_KV_ROPE_SCALE_LINEAR,
 LLM_KV_ROPE_SCALING_TYPE,
 LLM_KV_ROPE_SCALING_FACTOR,
+LLM_KV_ROPE_SCALING_ATTN_FACTOR,
 LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
 LLM_KV_ROPE_SCALING_FINETUNED,
+LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

 LLM_KV_SPLIT_NO,
 LLM_KV_SPLIT_COUNT,
@@ -359,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
 { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

-{ LLM_KV_VOCAB_SIZE, "%s.vocab_size"
-{ LLM_KV_CONTEXT_LENGTH, "%s.context_length"
-{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length"
-{ LLM_KV_BLOCK_COUNT, "%s.block_count"
-{
-{
-{
-{
-{
-{
-{
+{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
+{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

 { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
 { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -380,14 +387,18 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
 { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },

 { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
 { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
 { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
 { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
 { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
 { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
 { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },

 { LLM_KV_SPLIT_NO, "split.no" },
 { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -441,6 +452,8 @@ enum llm_tensor {
 LLM_TENSOR_OUTPUT,
 LLM_TENSOR_OUTPUT_NORM,
 LLM_TENSOR_ROPE_FREQS,
+LLM_TENSOR_ROPE_FACTORS_LONG,
+LLM_TENSOR_ROPE_FACTORS_SHORT,
 LLM_TENSOR_ATTN_Q,
 LLM_TENSOR_ATTN_K,
 LLM_TENSOR_ATTN_V,
@@ -460,6 +473,7 @@ enum llm_tensor {
 LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
 LLM_TENSOR_FFN_GATE_EXP,
 LLM_TENSOR_FFN_UP_EXP,
+LLM_TENSOR_FFN_NORM_EXPS,
 LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
 LLM_TENSOR_FFN_GATE_EXPS,
 LLM_TENSOR_FFN_UP_EXPS,
@@ -476,6 +490,12 @@ enum llm_tensor {
 LLM_TENSOR_SSM_A,
 LLM_TENSOR_SSM_D,
 LLM_TENSOR_SSM_OUT,
+LLM_TENSOR_ATTN_Q_A,
+LLM_TENSOR_ATTN_Q_B,
+LLM_TENSOR_ATTN_KV_A_MQA,
+LLM_TENSOR_ATTN_KV_B,
+LLM_TENSOR_ATTN_Q_A_NORM,
+LLM_TENSOR_ATTN_KV_A_NORM,
 };

 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -598,23 +618,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
-{
-LLM_ARCH_PERSIMMON,
-{
-{ LLM_TENSOR_TOKEN_EMBD, "token_embd"},
-{ LLM_TENSOR_OUTPUT_NORM, "output_norm"},
-{ LLM_TENSOR_OUTPUT, "output"},
-{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
-{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
-{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
-{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
-{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
-{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
-{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
-{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
-{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
-},
-},
 {
 LLM_ARCH_MPT,
 {
@@ -825,18 +828,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
 {
 LLM_ARCH_PHI3,
 {
-{ LLM_TENSOR_TOKEN_EMBD,
-{ LLM_TENSOR_OUTPUT_NORM,
-{ LLM_TENSOR_OUTPUT,
-{
-{
-{
-{
-{
-{
-{
-{
-{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
 {
@@ -1052,6 +1057,57 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_ARCTIC,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+},
+},
+{
+LLM_ARCH_DEEPSEEK2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -1646,12 +1702,13 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-
+// NOTE: avoid ever using this except for building the token_to_piece caches
+static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
 std::vector<char> result(8, 0);
-const int n_tokens = llama_token_to_piece(
+const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
 if (n_tokens < 0) {
 result.resize(-n_tokens);
-int check = llama_token_to_piece(
+int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
 GGML_ASSERT(check == -n_tokens);
 }
 else {
@@ -1697,6 +1754,8 @@ struct llama_state {
 llama_state() {
 #ifdef GGML_USE_METAL
 ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
 }

@@ -1710,23 +1769,31 @@ static llama_state g_state;
 // available llama models
 enum e_model {
 MODEL_UNKNOWN,
+MODEL_14M,
 MODEL_17M,
 MODEL_22M,
 MODEL_33M,
+MODEL_70M,
 MODEL_109M,
 MODEL_137M,
+MODEL_160M,
 MODEL_335M,
+MODEL_410M,
 MODEL_0_5B,
 MODEL_1B,
+MODEL_1_4B,
 MODEL_2B,
+MODEL_2_8B,
 MODEL_3B,
 MODEL_4B,
+MODEL_6_9B,
 MODEL_7B,
 MODEL_8B,
 MODEL_12B,
 MODEL_13B,
 MODEL_14B,
 MODEL_15B,
+MODEL_16B,
 MODEL_20B,
 MODEL_30B,
 MODEL_34B,
@@ -1734,6 +1801,7 @@ enum e_model {
 MODEL_40B,
 MODEL_65B,
 MODEL_70B,
+MODEL_236B,
 MODEL_314B,
 MODEL_SMALL,
 MODEL_MEDIUM,
@@ -1743,6 +1811,7 @@ enum e_model {
 MODEL_8x7B,
 MODEL_8x22B,
 MODEL_16x12B,
+MODEL_10B_128x3_66B,
 };

 static const size_t kiB = 1024;
@@ -1752,6 +1821,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
 bool vocab_only;
 bool rope_finetuned;
+bool use_par_res;

 uint32_t n_vocab;
 uint32_t n_ctx_train; // context size the model was trained on
@@ -1767,12 +1837,21 @@ struct llama_hparams {
 uint32_t n_expert_used = 0;
 uint32_t n_vocab_type = 0; // for BERT-style token types

+uint32_t n_layer_dense_lead = 0;
+uint32_t n_lora_q = 0;
+uint32_t n_lora_kv = 0;
+uint32_t n_ff_exp = 0;
+uint32_t n_expert_shared = 0;
+float expert_weights_scale = 0.0;
+
 float f_norm_eps;
 float f_norm_rms_eps;

+float rope_attn_factor = 1.0f;
 float rope_freq_base_train;
 float rope_freq_scale_train;
 uint32_t n_yarn_orig_ctx;
+float rope_yarn_log_mul;

 // for State Space Models
 uint32_t ssm_d_conv = 0;
@@ -1806,6 +1885,12 @@ struct llama_hparams {
 if (this->n_expert != other.n_expert) return true;
 if (this->n_expert_used != other.n_expert_used) return true;

+if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
+if (this->n_lora_q != other.n_lora_q) return true;
+if (this->n_lora_kv != other.n_lora_kv) return true;
+if (this->n_ff_exp != other.n_ff_exp) return true;
+if (this->n_expert_shared != other.n_expert_shared) return true;
+
 if (this->rope_finetuned != other.rope_finetuned) return true;
 if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

@@ -1818,8 +1903,11 @@ struct llama_hparams {

 if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
 if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;

 return false;
 }
@@ -1895,6 +1983,8 @@ struct llama_layer {
 struct ggml_tensor * attn_k_norm_b;
 struct ggml_tensor * attn_out_norm;
 struct ggml_tensor * attn_out_norm_b;
+struct ggml_tensor * attn_q_a_norm;
+struct ggml_tensor * attn_kv_a_norm;

 // attention
 struct ggml_tensor * wq;
@@ -1902,6 +1992,10 @@ struct llama_layer {
 struct ggml_tensor * wv;
 struct ggml_tensor * wo;
 struct ggml_tensor * wqkv;
+struct ggml_tensor * wq_a;
+struct ggml_tensor * wq_b;
+struct ggml_tensor * wkv_a_mqa;
+struct ggml_tensor * wkv_b;

 // attention bias
 struct ggml_tensor * bq;
@@ -1915,6 +2009,7 @@ struct llama_layer {
 struct ggml_tensor * ffn_norm_b;
 struct ggml_tensor * layer_out_norm;
 struct ggml_tensor * layer_out_norm_b;
+struct ggml_tensor * ffn_norm_exps;

 // ff
 struct ggml_tensor * ffn_gate; // w1
@@ -1934,8 +2029,9 @@ struct llama_layer {
 struct ggml_tensor * ffn_up_shexp;

 // ff bias
-struct ggml_tensor *
-struct ggml_tensor *
+struct ggml_tensor * ffn_gate_b = nullptr;
+struct ggml_tensor * ffn_down_b = nullptr; // b2
+struct ggml_tensor * ffn_up_b = nullptr; // b3
 struct ggml_tensor * ffn_act;

 // mamba proj
@@ -1952,6 +2048,10 @@ struct llama_layer {
 // mamba bias
 struct ggml_tensor * ssm_conv1d_b;
 struct ggml_tensor * ssm_dt_b;
+
+// long rope factors
+struct ggml_tensor * rope_long = nullptr;
+struct ggml_tensor * rope_short = nullptr;
 };

 struct llama_kv_cell {
@@ -2063,7 +2163,9 @@ struct llama_vocab {
 std::unordered_map<token, id> token_to_id;
 std::vector<token_data> id_to_token;

-std::
+std::vector<id> cache_special_tokens;
+std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
+std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);

 std::map<std::pair<std::string, std::string>, int> bpe_ranks;

@@ -2268,10 +2370,6 @@ struct llama_context {

 // control vectors
 struct llama_control_vector cvec;
-
-#ifdef GGML_USE_MPI
-ggml_mpi_context * ctx_mpi = NULL;
-#endif
 };

 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2589,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
 struct llama_kv_cache & cache,
 const struct llama_batch & batch) {
-const uint32_t n_ctx = cache.size;
 const uint32_t n_tokens = batch.n_tokens;

 if (cache.recurrent) {
@@ -2542,16 +2639,16 @@ static bool llama_kv_cache_find_slot(
 }
 // otherwise, one cell per token.

-if (n_tokens >
-LLAMA_LOG_ERROR("%s: n_tokens=%d >
+if (n_tokens > cache.size) {
+LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
 return false;
 }

 uint32_t n_tested = 0;

 while (true) {
-if (cache.head + n_tokens >
-n_tested +=
+if (cache.head + n_tokens > cache.size) {
+n_tested += cache.size - cache.head;
 cache.head = 0;
 continue;
 }
@@ -2570,7 +2667,7 @@ static bool llama_kv_cache_find_slot(
 break;
 }

-if (n_tested >=
+if (n_tested >= cache.size) {
 //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
 return false;
 }
@@ -3330,6 +3427,39 @@ struct llama_model_loader {
 return get_arr_n(llm_kv(kid), result, required);
 }

+template<typename T>
+bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+const int kid = gguf_find_key(meta, key.c_str());
+
+if (kid < 0) {
+if (required) {
+throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+}
+return false;
+}
+
+struct GGUFMeta::ArrayInfo arr_info =
+GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+}
+
+// GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+result.resize(arr_info.length);
+result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+return true;
+}
+
+template<typename T>
+bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+return get_arr(llm_kv(kid), result, required);
+}
+
 template<typename T>
 bool get_key(const std::string & key, T & result, const bool required = true) {
 auto it = kv_overrides.find(key);
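Aside (not part of the diff): the `get_arr` helper added in the hunk above reads a GGUF array key into a `std::vector`; its call sites are outside this excerpt. The following is only an illustrative sketch of how such a helper might be invoked, with the key string and element type chosen purely as assumptions for the example.

```cpp
// Illustrative sketch only -- "rope.scaling.factors" is an assumed key name,
// and 'ml' is assumed to be the llama_model_loader in scope (as in the
// loader code above). get_arr resolves the GGUF key, checks that the stored
// array is float32/int32, and copies it into the vector.
std::vector<float> factors;
if (ml.get_arr("rope.scaling.factors", factors, /*required =*/ false)) {
    // factors now holds the float32 array stored under that key
}
```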
@@ -3404,11 +3534,15 @@ struct llama_model_loader {
 return get_tensor_meta(get_tensor_name(i));
 }

-struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
 struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
 ggml_set_name(tensor, ggml_get_name(cur));

-
+if (duplicated) {
+size_data += ggml_nbytes(cur);
+} else {
+n_created++;
+}

 return tensor;
 }
@@ -3443,14 +3577,17 @@ struct llama_model_loader {
 return cur;
 }

-
-
+static const int TENSOR_NOT_REQUIRED = 1;
+static const int TENSOR_DUPLICATED = 2;
+
+struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

 if (cur == NULL) {
 return NULL;
 }

-return create_tensor_for(ctx, cur);
+return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
 }

 struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3750,37 +3887,50 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
 switch (type) {
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-case
-
+case MODEL_14M: return "14M";
+case MODEL_17M: return "17M";
+case MODEL_22M: return "22M";
+case MODEL_33M: return "33M";
+case MODEL_70M: return "70M";
+case MODEL_109M: return "109M";
+case MODEL_137M: return "137M";
+case MODEL_160M: return "160M";
+case MODEL_335M: return "335M";
+case MODEL_410M: return "410M";
+case MODEL_0_5B: return "0.5B";
+case MODEL_1B: return "1B";
+case MODEL_1_4B: return "1.4B";
+case MODEL_2B: return "2B";
+case MODEL_2_8B: return "2.8B";
+case MODEL_3B: return "3B";
+case MODEL_4B: return "4B";
+case MODEL_6_9B: return "6.9B";
+case MODEL_7B: return "7B";
+case MODEL_8B: return "8B";
+case MODEL_12B: return "12B";
+case MODEL_13B: return "13B";
+case MODEL_14B: return "14B";
+case MODEL_15B: return "15B";
+case MODEL_16B: return "16B";
+case MODEL_20B: return "20B";
+case MODEL_30B: return "30B";
+case MODEL_34B: return "34B";
+case MODEL_35B: return "35B";
+case MODEL_40B: return "40B";
+case MODEL_65B: return "65B";
+case MODEL_70B: return "70B";
+case MODEL_236B: return "236B";
+case MODEL_314B: return "314B";
+case MODEL_SMALL: return "0.1B";
+case MODEL_MEDIUM: return "0.4B";
+case MODEL_LARGE: return "0.8B";
+case MODEL_XL: return "1.5B";
+case MODEL_A2_7B: return "A2.7B";
+case MODEL_8x7B: return "8x7B";
+case MODEL_8x22B: return "8x22B";
+case MODEL_16x12B: return "16x12B";
+case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+default: return "?B";
 }
 }

@@ -3873,6 +4023,8 @@ static void llm_load_hparams(
 }
 hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
 // sanity check for n_rot (optional)
 {
 hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3910,7 +4062,9 @@
 switch (hparams.n_layer) {
 case 22: model.type = e_model::MODEL_1B; break;
 case 26: model.type = e_model::MODEL_3B; break;
-
+// granite uses a vocab with len 49152
+case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+case 36: model.type = e_model::MODEL_8B; break; // granite
 case 40: model.type = e_model::MODEL_13B; break;
 case 48: model.type = e_model::MODEL_34B; break;
 case 60: model.type = e_model::MODEL_30B; break;
@@ -3972,14 +4126,6 @@
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
-case LLM_ARCH_PERSIMMON:
-{
-ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-switch (hparams.n_layer) {
-case 36: model.type = e_model::MODEL_8B; break;
-default: model.type = e_model::MODEL_UNKNOWN;
-}
-} break;
 case LLM_ARCH_REFACT:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4267,7 @@
 switch (hparams.n_layer) {
 case 24: model.type = e_model::MODEL_1B; break;
 case 32: model.type = e_model::MODEL_3B; break;
+case 40: model.type = e_model::MODEL_14B; break;
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
@@ -4187,6 +4334,8 @@
 case 30: model.type = e_model::MODEL_3B; break;
 case 32: model.type = e_model::MODEL_7B; break;
 case 40: model.type = e_model::MODEL_15B; break;
+case 52: model.type = e_model::MODEL_20B; break; // granite
+case 88: model.type = e_model::MODEL_34B; break; // granite
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
@@ -4261,6 +4410,85 @@
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_GPTNEOX:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+switch (hparams.n_layer) {
+case 6:
+switch (hparams.n_ff) {
+case 512: model.type = e_model::MODEL_14M; break;
+case 2048: model.type = e_model::MODEL_70M; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 12:
+switch (hparams.n_ff) {
+case 3072: model.type = e_model::MODEL_160M; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 16:
+switch (hparams.n_ff) {
+case 8192: model.type = e_model::MODEL_1B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 24:
+switch (hparams.n_ff) {
+case 4096: model.type = e_model::MODEL_410M; break;
+case 8192: model.type = e_model::MODEL_1_4B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 32:
+switch (hparams.n_ff) {
+case 10240: model.type = e_model::MODEL_2_8B; break;
+case 16384: model.type = e_model::MODEL_6_9B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 36:
+switch (hparams.n_ff) {
+case 20480: model.type = e_model::MODEL_12B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+case 44:
+switch (hparams.n_ff) {
+case 24576: model.type = e_model::MODEL_20B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+} break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
+case LLM_ARCH_ARCTIC:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+if (hparams.n_expert == 128) {
+switch (hparams.n_layer) {
+case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} else {
+model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
+case LLM_ARCH_DEEPSEEK2:
+{
+bool is_lite = (hparams.n_layer == 27);
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+if (!is_lite) {
+ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+}
+ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+switch (hparams.n_layer) {
+case 27: model.type = e_model::MODEL_16B; break;
+case 60: model.type = e_model::MODEL_236B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }

@@ -4367,15 +4595,14 @@ static void llm_load_vocab(
 vocab.special_cls_id = 101;
 vocab.special_mask_id = 103;
 vocab.add_space_prefix = false;
-} else {
-
-
-
-
-
-vocab.type = LLAMA_VOCAB_TYPE_SPM;
-return;
+} else if (tokenizer_model == "gpt2") {
+vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+if (add_space_prefix_keyidx != -1) {
+vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
 }
+
 // read bpe merges and populate bpe ranks
 const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
 if (merges_keyidx == -1) {
@@ -4409,6 +4636,8 @@
 vocab.special_pad_id = -1;
 vocab.special_cls_id = -1;
 vocab.special_mask_id = -1;
+} else {
+throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
 }

 // for now, only BPE models have pre-tokenizers
@@ -4461,12 +4690,18 @@
 } else if (
 tokenizer_pre == "qwen2") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+} else if (
+tokenizer_pre == "stablelm2") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
 } else if (
 tokenizer_pre == "olmo") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
 } else if (
 tokenizer_pre == "dbrx") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+} else if (
+tokenizer_pre == "smaug-bpe") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
 } else {
 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 }
@@ -4582,7 +4817,8 @@
 (t.first == "<|eot_id|>" ||
 t.first == "<|im_end|>" ||
 t.first == "<|end|>" ||
-t.first == "<end_of_turn>"
+t.first == "<end_of_turn>" ||
+t.first == "<|endoftext|>"
 )
 ) {
 vocab.special_eot_id = t.second;
@@ -4594,97 +4830,40 @@

 // build special tokens cache
 {
-
-// and will always be correctly labeled in 'added_tokens.json' etc.
-// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
-// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
-// are special tokens.
-// From testing, this appears to correlate 1:1 with special tokens.
-//
-
-// Counting special tokens and verifying in only one direction
-// is sufficient to detect difference in those two sets.
-//
-uint32_t special_tokens_count_by_type = 0;
-uint32_t special_tokens_count_from_verification = 0;
-
-bool special_tokens_definition_mismatch = false;
-
-for (const auto & t : vocab.token_to_id) {
-const auto & token = t.first;
-const auto & id = t.second;
-
-// Count all non-normal tokens in the vocab while iterating
+for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
 if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
-
+vocab.cache_special_tokens.push_back(id);
 }
+}

-
-
-
-
-
-// and check if both halves can be matched to a valid token
-for (unsigned i = 1; i < token.length();) {
-const auto left = token.substr(0, i);
-const auto right = token.substr(i);
-
-// check if we didnt partition in the middle of a utf sequence
-auto utf = utf8_len(left.at(left.length() - 1));
-
-if (utf == 1) {
-if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
-vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
-is_tokenizable = true;
-break;
-}
-i++;
-} else {
-// skip over the rest of multibyte utf sequence
-i += utf - 1;
-}
-}
+std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+[&] (const llama_vocab::id a, const llama_vocab::id b) {
+return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
+}
+);

-
-
-// it's faster to re-filter them here, since there are way less candidates now
+LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+}

-
-
-
-utf8_str_len++;
-i += utf8_len(token.at(i));
-}
+// build token to piece caches
+{
+size_t size_cache = 0;

-
-
-// At this point what we have left are special tokens only
-vocab.special_tokens_cache[token] = id;
+std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
+std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);

-
-
+for (uint32_t id = 0; id < n_vocab; ++id) {
+cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
+cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);

-
-
-special_tokens_definition_mismatch = true;
-}
-}
-}
-}
+size_cache += cache_token_to_piece[id].size();
+size_cache += cache_token_to_piece_special[id].size();
 }

-
-
-
-
-special_tokens_count_by_type, vocab.id_to_token.size()
-);
-} else {
-LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
-__func__,
-special_tokens_count_from_verification, vocab.id_to_token.size()
-);
-}
+std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
+std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+
+LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
 }
 }

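Aside (not part of the diff): the hunk above precomputes the piece string for every token at model load time. The lookup side is outside this excerpt; the following is only a minimal sketch of how such a cache could be consulted, assuming `vocab`, `id`, and `special` are in scope as in the loader code above.

```cpp
// Sketch only: return the precomputed piece for a token id instead of
// re-deriving it on every call; which cache is used depends on whether
// special-token rendering was requested.
const std::string & piece = special
    ? vocab.cache_token_to_piece_special[id]
    : vocab.cache_token_to_piece[id];
```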
@@ -4765,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
 if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
 if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+
+if (model.arch == LLM_ARCH_DEEPSEEK2) {
+LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+}
 }

 // Returns false if cancelled by progress_callback
@@ -4908,6 +5097,7 @@ static bool llm_load_tensors(
 // create tensors for the weights
 {
 const int64_t n_embd = hparams.n_embd;
+const int64_t n_embd_head = n_embd / hparams.n_head;
 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4920,8 +5110,6 @@ static bool llm_load_tensors(
 throw std::runtime_error("model has expert layers but no expert layers are used");
 }

-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
 ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
 ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
 ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -4942,12 +5130,10 @@ static bool llm_load_tensors(
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 if (model.arch != LLM_ARCH_MINICPM){
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 // if output is NULL, init from the input tok embed
 if (model.output == NULL) {
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }
 }
 }
@@ -4966,10 +5152,10 @@ static bool llm_load_tensors(
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

 // optional bias tensors
-layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
-layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
-layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
-layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

@@ -4977,10 +5163,15 @@ static bool llm_load_tensors(
 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+// optional MLP bias
+layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
 } else {
 layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

-layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
 if (layer.ffn_gate_exps) {
 layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
 layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5022,12 +5213,10 @@ static bool llm_load_tensors(
 // output
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 // if output is NULL, init from the input tok embed
 if (model.output == NULL) {
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }
 }

@@ -5050,7 +5239,7 @@ static bool llm_load_tensors(

 layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

-layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
 if (layer.ffn_gate_exps) {
 layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
 layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5152,11 +5341,9 @@ static bool llm_load_tensors(
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 if (!model.output) {
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
 }
 }

@@ -5169,8 +5356,8 @@ static bool llm_load_tensors(
 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd},
-layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd},
+layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5188,7 +5375,12 @@ static bool llm_load_tensors(
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+if (!model.output) {
+// needs to be on GPU
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+}
+
 }

 for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5408,6 @@ static bool llm_load_tensors(
 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
 }
 } break;
-case LLM_ARCH_PERSIMMON:
-{
-model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-{
-model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-}
-
-for (int i = 0; i < n_layer; ++i) {
-ggml_context * ctx_layer = ctx_for_layer(i);
-ggml_context * ctx_split = ctx_for_layer_split(i);
-
-auto & layer = model.layers[i];
-
-layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
-layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
-layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
-layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
-layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
-layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
-layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
-layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
-layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
-layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
-}
-} break;
 case LLM_ARCH_BERT:
 case LLM_ARCH_NOMIC_BERT:
 {
@@ -5325,14 +5476,14 @@ static bool llm_load_tensors(
 layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
 layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

-layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
-layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
 layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

-layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
-layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
 layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
@@ -5394,18 +5545,16 @@ static bool llm_load_tensors(
 case LLM_ARCH_MPT:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train},
+model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

 // output
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd},
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 if (!model.output) {
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
 }
 }

@@ -5416,31 +5565,31 @@ static bool llm_load_tensors(
 auto & layer = model.layers[i];

 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd},
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5420
5569
|
|
5421
5570
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
5422
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
5571
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5423
5572
|
|
5424
5573
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5425
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
|
5574
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5426
5575
|
|
5427
5576
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5428
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
5577
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5429
5578
|
|
5430
5579
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
5431
|
-
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd},
|
5580
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5432
5581
|
|
5433
5582
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5434
|
-
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff},
|
5583
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5435
5584
|
|
5436
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
|
5437
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
|
5585
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5586
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5438
5587
|
|
5439
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
|
5440
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
|
5588
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5589
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5441
5590
|
|
5442
5591
|
// AWQ ScaleActivation layer
|
5443
|
-
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff},
|
5592
|
+
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5444
5593
|
}
|
5445
5594
|
} break;
|
5446
5595
|
case LLM_ARCH_STABLELM:
|
@@ -5469,17 +5618,17 @@ static bool llm_load_tensors(
|
|
5469
5618
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5470
5619
|
|
5471
5620
|
// optional bias tensors, present in Stable LM 2 1.6B
|
5472
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
5473
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
5474
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
5621
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5622
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5623
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5475
5624
|
|
5476
5625
|
// optional q and k layernorms, present in StableLM 2 12B
|
5477
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},
|
5478
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv},
|
5626
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5627
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5479
5628
|
|
5480
5629
|
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
5481
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},
|
5482
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
5630
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5631
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5483
5632
|
|
5484
5633
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5485
5634
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
@@ -5522,12 +5671,10 @@ static bool llm_load_tensors(
|
|
5522
5671
|
// output
|
5523
5672
|
{
|
5524
5673
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5525
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
5674
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5526
5675
|
// if output is NULL, init from the input tok embed
|
5527
5676
|
if (model.output == NULL) {
|
5528
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5529
|
-
ml.n_created--; // artificial tensor
|
5530
|
-
ml.size_data += ggml_nbytes(model.output);
|
5677
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5531
5678
|
}
|
5532
5679
|
}
|
5533
5680
|
|
@@ -5625,8 +5772,8 @@ static bool llm_load_tensors(
|
|
5625
5772
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5626
5773
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
5627
5774
|
|
5628
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa},
|
5629
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
5775
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5776
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5630
5777
|
|
5631
5778
|
if (layer.wqkv == nullptr) {
|
5632
5779
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
@@ -5663,17 +5810,20 @@ static bool llm_load_tensors(
|
|
5663
5810
|
ggml_context* ctx_layer = ctx_for_layer(i);
|
5664
5811
|
ggml_context* ctx_split = ctx_for_layer_split(i);
|
5665
5812
|
|
5666
|
-
auto& layer = model.layers[i];
|
5813
|
+
auto & layer = model.layers[i];
|
5667
5814
|
|
5668
5815
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
5669
5816
|
|
5670
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa },
|
5671
|
-
layer.wo
|
5817
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5818
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
5672
5819
|
|
5673
5820
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
5674
5821
|
|
5675
5822
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
5676
5823
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
5824
|
+
|
5825
|
+
layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
5826
|
+
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
5677
5827
|
}
|
5678
5828
|
} break;
|
5679
5829
|
case LLM_ARCH_PLAMO:
|
@@ -5842,9 +5992,7 @@ static bool llm_load_tensors(
|
|
5842
5992
|
|
5843
5993
|
// output
|
5844
5994
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5845
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
5846
|
-
ml.n_created--; // artificial tensor
|
5847
|
-
ml.size_data += ggml_nbytes(model.output);
|
5995
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
5848
5996
|
|
5849
5997
|
const int64_t n_ff = hparams.n_ff;
|
5850
5998
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
@@ -5879,12 +6027,10 @@ static bool llm_load_tensors(
|
|
5879
6027
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5880
6028
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
5881
6029
|
|
5882
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6030
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5883
6031
|
// if output is NULL, init from the input tok embed
|
5884
6032
|
if (model.output == NULL) {
|
5885
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5886
|
-
ml.n_created--; // artificial tensor
|
5887
|
-
ml.size_data += ggml_nbytes(model.output);
|
6033
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5888
6034
|
}
|
5889
6035
|
|
5890
6036
|
}
|
@@ -5935,12 +6081,10 @@ static bool llm_load_tensors(
|
|
5935
6081
|
{
|
5936
6082
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5937
6083
|
|
5938
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6084
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5939
6085
|
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
5940
6086
|
if (model.output == NULL) {
|
5941
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5942
|
-
ml.n_created--; // artificial tensor
|
5943
|
-
ml.size_data += ggml_nbytes(model.output);
|
6087
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5944
6088
|
}
|
5945
6089
|
}
|
5946
6090
|
|
@@ -6001,9 +6145,7 @@ static bool llm_load_tensors(
|
|
6001
6145
|
{
|
6002
6146
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6003
6147
|
// init output from the input tok embed
|
6004
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6005
|
-
ml.n_created--; // artificial tensor
|
6006
|
-
ml.size_data += ggml_nbytes(model.output);
|
6148
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
6007
6149
|
}
|
6008
6150
|
|
6009
6151
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -6035,12 +6177,10 @@ static bool llm_load_tensors(
|
|
6035
6177
|
|
6036
6178
|
// output
|
6037
6179
|
{
|
6038
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
6180
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
6039
6181
|
// if output is NULL, init from the input tok embed
|
6040
6182
|
if (model.output == NULL) {
|
6041
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6042
|
-
ml.n_created--; // artificial tensor
|
6043
|
-
ml.size_data += ggml_nbytes(model.output);
|
6183
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
6044
6184
|
}
|
6045
6185
|
}
|
6046
6186
|
|
@@ -6060,30 +6200,169 @@ static bool llm_load_tensors(
|
|
6060
6200
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6061
6201
|
}
|
6062
6202
|
} break;
|
6063
|
-
|
6064
|
-
|
6065
|
-
|
6066
|
-
|
6203
|
+
case LLM_ARCH_GPTNEOX:
|
6204
|
+
{
|
6205
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6206
|
+
// output
|
6207
|
+
{
|
6208
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6209
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
6210
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
6211
|
+
}
|
6067
6212
|
|
6068
|
-
|
6213
|
+
for (int i = 0; i < n_layer; ++i) {
|
6214
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6215
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6069
6216
|
|
6070
|
-
|
6071
|
-
model.mappings.reserve(ml.mappings.size());
|
6217
|
+
auto & layer = model.layers[i];
|
6072
6218
|
|
6073
|
-
|
6074
|
-
|
6075
|
-
ctx_bufs.reserve(ctx_map.size());
|
6219
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6220
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
6076
6221
|
|
6077
|
-
|
6078
|
-
|
6079
|
-
model.bufs.reserve(n_max_backend_buffer);
|
6222
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
6223
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
6080
6224
|
|
6081
|
-
|
6082
|
-
|
6083
|
-
ggml_context * ctx = it.second;
|
6225
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
6226
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
6084
6227
|
|
6085
|
-
|
6086
|
-
|
6228
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6229
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
6230
|
+
|
6231
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
6232
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
6233
|
+
|
6234
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6235
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
6236
|
+
}
|
6237
|
+
} break;
|
6238
|
+
case LLM_ARCH_ARCTIC:
|
6239
|
+
{
|
6240
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6241
|
+
|
6242
|
+
// output
|
6243
|
+
{
|
6244
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6245
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
6246
|
+
// if output is NULL, init from the input tok embed
|
6247
|
+
if (model.output == NULL) {
|
6248
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
6249
|
+
}
|
6250
|
+
}
|
6251
|
+
|
6252
|
+
for (int i = 0; i < n_layer; ++i) {
|
6253
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6254
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6255
|
+
|
6256
|
+
auto & layer = model.layers[i];
|
6257
|
+
|
6258
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6259
|
+
|
6260
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
6261
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
6262
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
6263
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
6264
|
+
|
6265
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6266
|
+
|
6267
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
|
6268
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
|
6269
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
|
6270
|
+
|
6271
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
6272
|
+
layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
|
6273
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
6274
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
6275
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
6276
|
+
}
|
6277
|
+
} break;
|
6278
|
+
case LLM_ARCH_DEEPSEEK2:
|
6279
|
+
{
|
6280
|
+
bool is_lite = (hparams.n_layer == 27);
|
6281
|
+
|
6282
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
6283
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
6284
|
+
const uint32_t q_lora_rank = hparams.n_lora_q;
|
6285
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
6286
|
+
const uint32_t n_ff_exp = hparams.n_ff_exp;
|
6287
|
+
|
6288
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6289
|
+
|
6290
|
+
// output
|
6291
|
+
{
|
6292
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6293
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
6294
|
+
}
|
6295
|
+
|
6296
|
+
for (int i = 0; i < n_layer; ++i) {
|
6297
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6298
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6299
|
+
|
6300
|
+
auto & layer = model.layers[i];
|
6301
|
+
|
6302
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6303
|
+
if (!is_lite) {
|
6304
|
+
layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
|
6305
|
+
}
|
6306
|
+
layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
|
6307
|
+
|
6308
|
+
if (!is_lite) {
|
6309
|
+
layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
|
6310
|
+
layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
|
6311
|
+
} else {
|
6312
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
|
6313
|
+
}
|
6314
|
+
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
|
6315
|
+
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
|
6316
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
|
6317
|
+
|
6318
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6319
|
+
|
6320
|
+
if ((uint32_t) i < hparams.n_layer_dense_lead) {
|
6321
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
6322
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
6323
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6324
|
+
} else {
|
6325
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
6326
|
+
|
6327
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
6328
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
6329
|
+
|
6330
|
+
// MoE branch
|
6331
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
6332
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
6333
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
6334
|
+
|
6335
|
+
// Shared expert branch
|
6336
|
+
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
6337
|
+
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
|
6338
|
+
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
6339
|
+
}
|
6340
|
+
}
|
6341
|
+
} break;
|
6342
|
+
default:
|
6343
|
+
throw std::runtime_error("unknown architecture");
|
6344
|
+
}
|
6345
|
+
}
|
6346
|
+
|
6347
|
+
ml.done_getting_tensors();
|
6348
|
+
|
6349
|
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
6350
|
+
model.mappings.reserve(ml.mappings.size());
|
6351
|
+
|
6352
|
+
// create the backend buffers
|
6353
|
+
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
6354
|
+
ctx_bufs.reserve(ctx_map.size());
|
6355
|
+
|
6356
|
+
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
6357
|
+
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
6358
|
+
model.bufs.reserve(n_max_backend_buffer);
|
6359
|
+
|
6360
|
+
for (auto & it : ctx_map) {
|
6361
|
+
ggml_backend_buffer_type_t buft = it.first;
|
6362
|
+
ggml_context * ctx = it.second;
|
6363
|
+
|
6364
|
+
llama_buf_map bufs;
|
6365
|
+
bufs.reserve(n_max_backend_buffer);
|
6087
6366
|
|
6088
6367
|
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
6089
6368
|
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
@@ -6324,10 +6603,7 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
6324
6603
|
|
6325
6604
|
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
6326
6605
|
} else {
|
6327
|
-
|
6328
|
-
GGML_ASSERT(false && "not implemented");
|
6329
|
-
#endif
|
6330
|
-
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
6606
|
+
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
6331
6607
|
inpL = lctx.inp_embd;
|
6332
6608
|
ggml_set_input(lctx.inp_embd);
|
6333
6609
|
}
|
@@ -6517,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
6517
6793
|
int64_t n_expert_used,
|
6518
6794
|
llm_ffn_op_type type_op,
|
6519
6795
|
bool norm_w,
|
6796
|
+
bool scale_w,
|
6797
|
+
float w_scale,
|
6520
6798
|
const llm_build_cb & cb,
|
6521
6799
|
int il) {
|
6522
6800
|
int64_t n_embd = cur->ne[0];
|
@@ -6548,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
6548
6826
|
|
6549
6827
|
weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
|
6550
6828
|
}
|
6829
|
+
if (scale_w) {
|
6830
|
+
weights = ggml_scale(ctx, weights, w_scale);
|
6831
|
+
cb(weights, "ffn_moe_weights_scaled", il);
|
6832
|
+
}
|
6551
6833
|
|
6552
6834
|
cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
6553
6835
|
ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
@@ -6652,7 +6934,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6652
6934
|
|
6653
6935
|
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
6654
6936
|
|
6655
|
-
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6937
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
6656
6938
|
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
6657
6939
|
}
|
6658
6940
|
|
@@ -6661,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6661
6943
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
6662
6944
|
cb(kq, "kq", il);
|
6663
6945
|
|
6664
|
-
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6946
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
6665
6947
|
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
6666
6948
|
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
6667
6949
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
@@ -6886,17 +7168,20 @@ struct llm_build_context {
|
|
6886
7168
|
cb(lctx.inp_K_shift, "K_shift", -1);
|
6887
7169
|
ggml_set_input(lctx.inp_K_shift);
|
6888
7170
|
|
7171
|
+
|
6889
7172
|
for (int il = 0; il < n_layer; ++il) {
|
7173
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
6890
7174
|
struct ggml_tensor * tmp =
|
6891
7175
|
// we rotate only the first n_rot dimensions
|
6892
|
-
|
7176
|
+
ggml_rope_ext_inplace(ctx0,
|
6893
7177
|
ggml_view_3d(ctx0, kv_self.k_l[il],
|
6894
7178
|
n_embd_head_k, n_head_kv, n_ctx,
|
6895
7179
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
6896
7180
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
6897
7181
|
0),
|
6898
|
-
lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7182
|
+
lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6899
7183
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7184
|
+
|
6900
7185
|
cb(tmp, "K_shifted", il);
|
6901
7186
|
ggml_build_forward_expand(gf, tmp);
|
6902
7187
|
}
|
@@ -6999,6 +7284,17 @@ struct llm_build_context {
|
|
6999
7284
|
return lctx.inp_pos;
|
7000
7285
|
}
|
7001
7286
|
|
7287
|
+
struct ggml_tensor * build_rope_factors(int il) {
|
7288
|
+
// choose long/short freq factors based on the context size
|
7289
|
+
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
7290
|
+
|
7291
|
+
if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
|
7292
|
+
return model.layers[il].rope_long;
|
7293
|
+
}
|
7294
|
+
|
7295
|
+
return model.layers[il].rope_short;
|
7296
|
+
}
|
7297
|
+
|
7002
7298
|
struct ggml_tensor * build_inp_out_ids() {
|
7003
7299
|
lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
7004
7300
|
cb(lctx.inp_out_ids, "inp_out_ids", -1);
|
@@ -7106,15 +7402,15 @@ struct llm_build_context {
|
|
7106
7402
|
cb(Vcur, "Vcur", il);
|
7107
7403
|
}
|
7108
7404
|
|
7109
|
-
Qcur =
|
7110
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7405
|
+
Qcur = ggml_rope_ext(
|
7406
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7111
7407
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7112
7408
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7113
7409
|
);
|
7114
7410
|
cb(Qcur, "Qcur", il);
|
7115
7411
|
|
7116
|
-
Kcur =
|
7117
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7412
|
+
Kcur = ggml_rope_ext(
|
7413
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7118
7414
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7119
7415
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7120
7416
|
);
|
@@ -7144,9 +7440,9 @@ struct llm_build_context {
|
|
7144
7440
|
cb(cur, "ffn_norm", il);
|
7145
7441
|
|
7146
7442
|
cur = llm_build_ffn(ctx0, cur,
|
7147
|
-
model.layers[il].ffn_up,
|
7148
|
-
model.layers[il].ffn_gate,
|
7149
|
-
model.layers[il].ffn_down,
|
7443
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
7444
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
|
7445
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
7150
7446
|
NULL,
|
7151
7447
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
7152
7448
|
cb(cur, "ffn_out", il);
|
@@ -7164,6 +7460,7 @@ struct llm_build_context {
|
|
7164
7460
|
model.layers[il].ffn_down_exps,
|
7165
7461
|
n_expert, n_expert_used,
|
7166
7462
|
LLM_FFN_SILU, true,
|
7463
|
+
false, 0.0,
|
7167
7464
|
cb, il);
|
7168
7465
|
cb(cur, "ffn_moe_out", il);
|
7169
7466
|
}
|
@@ -7236,13 +7533,13 @@ struct llm_build_context {
|
|
7236
7533
|
|
7237
7534
|
switch (model.type) {
|
7238
7535
|
case MODEL_7B:
|
7239
|
-
Qcur =
|
7240
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7536
|
+
Qcur = ggml_rope_ext(
|
7537
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7241
7538
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7242
7539
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7243
7540
|
);
|
7244
|
-
Kcur =
|
7245
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7541
|
+
Kcur = ggml_rope_ext(
|
7542
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7246
7543
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7247
7544
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7248
7545
|
);
|
@@ -7348,15 +7645,15 @@ struct llm_build_context {
|
|
7348
7645
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
7349
7646
|
cb(Vcur, "Vcur", il);
|
7350
7647
|
|
7351
|
-
Qcur =
|
7352
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7648
|
+
Qcur = ggml_rope_ext(
|
7649
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7353
7650
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7354
7651
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7355
7652
|
);
|
7356
7653
|
cb(Qcur, "Qcur", il);
|
7357
7654
|
|
7358
|
-
Kcur =
|
7359
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7655
|
+
Kcur = ggml_rope_ext(
|
7656
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7360
7657
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7361
7658
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7362
7659
|
);
|
@@ -7469,14 +7766,14 @@ struct llm_build_context {
|
|
7469
7766
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7470
7767
|
|
7471
7768
|
// using mode = 2 for neox mode
|
7472
|
-
Qcur =
|
7473
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7769
|
+
Qcur = ggml_rope_ext(
|
7770
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
7474
7771
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7475
7772
|
);
|
7476
7773
|
cb(Qcur, "Qcur", il);
|
7477
7774
|
|
7478
|
-
Kcur =
|
7479
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
7775
|
+
Kcur = ggml_rope_ext(
|
7776
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
7480
7777
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7481
7778
|
);
|
7482
7779
|
cb(Kcur, "Kcur", il);
|
@@ -7592,15 +7889,15 @@ struct llm_build_context {
|
|
7592
7889
|
cb(Vcur, "Vcur", il);
|
7593
7890
|
}
|
7594
7891
|
|
7595
|
-
Qcur =
|
7596
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7892
|
+
Qcur = ggml_rope_ext(
|
7893
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7597
7894
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7598
7895
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7599
7896
|
);
|
7600
7897
|
cb(Qcur, "Qcur", il);
|
7601
7898
|
|
7602
|
-
Kcur =
|
7603
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7899
|
+
Kcur = ggml_rope_ext(
|
7900
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7604
7901
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7605
7902
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7606
7903
|
);
|
@@ -7645,6 +7942,7 @@ struct llm_build_context {
|
|
7645
7942
|
model.layers[il].ffn_down_exps,
|
7646
7943
|
n_expert, n_expert_used,
|
7647
7944
|
LLM_FFN_GELU, true,
|
7945
|
+
false, 0.0,
|
7648
7946
|
cb, il);
|
7649
7947
|
cb(cur, "ffn_moe_out", il);
|
7650
7948
|
|
@@ -7744,15 +8042,15 @@ struct llm_build_context {
|
|
7744
8042
|
cb(Kcur, "Kcur", il);
|
7745
8043
|
cb(Vcur, "Vcur", il);
|
7746
8044
|
|
7747
|
-
Qcur =
|
7748
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8045
|
+
Qcur = ggml_rope_ext(
|
8046
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7749
8047
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7750
8048
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7751
8049
|
);
|
7752
8050
|
cb(Qcur, "Qcur", il);
|
7753
8051
|
|
7754
|
-
Kcur =
|
7755
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8052
|
+
Kcur = ggml_rope_ext(
|
8053
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7756
8054
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7757
8055
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7758
8056
|
);
|
@@ -7788,6 +8086,7 @@ struct llm_build_context {
|
|
7788
8086
|
model.layers[il].ffn_down_exps,
|
7789
8087
|
n_expert, n_expert_used,
|
7790
8088
|
LLM_FFN_SILU, true,
|
8089
|
+
false, 0.0,
|
7791
8090
|
cb, il);
|
7792
8091
|
cb(cur, "ffn_moe_out", il);
|
7793
8092
|
|
@@ -7921,213 +8220,6 @@ struct llm_build_context {
|
|
7921
8220
|
return gf;
|
7922
8221
|
}
|
7923
8222
|
|
7924
|
-
struct ggml_cgraph * build_persimmon() {
|
7925
|
-
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7926
|
-
|
7927
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7928
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7929
|
-
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
7930
|
-
|
7931
|
-
struct ggml_tensor * cur;
|
7932
|
-
struct ggml_tensor * inpL;
|
7933
|
-
|
7934
|
-
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7935
|
-
|
7936
|
-
// inp_pos - contains the positions
|
7937
|
-
struct ggml_tensor * inp_pos = build_inp_pos();
|
7938
|
-
|
7939
|
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7940
|
-
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7941
|
-
|
7942
|
-
for (int il = 0; il < n_layer; ++il) {
|
7943
|
-
struct ggml_tensor * residual = inpL;
|
7944
|
-
|
7945
|
-
cur = llm_build_norm(ctx0, inpL, hparams,
|
7946
|
-
model.layers[il].attn_norm,
|
7947
|
-
model.layers[il].attn_norm_b,
|
7948
|
-
LLM_NORM, cb, il);
|
7949
|
-
cb(cur, "attn_norm", il);
|
7950
|
-
|
7951
|
-
// self attention
|
7952
|
-
{
|
7953
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
7954
|
-
cb(cur, "wqkv", il);
|
7955
|
-
|
7956
|
-
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
7957
|
-
cb(cur, "bqkv", il);
|
7958
|
-
|
7959
|
-
// split qkv
|
7960
|
-
GGML_ASSERT(n_head_kv == n_head);
|
7961
|
-
|
7962
|
-
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
|
7963
|
-
cb(tmpqkv, "tmpqkv", il);
|
7964
|
-
|
7965
|
-
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
|
7966
|
-
cb(tmpqkv_perm, "tmpqkv", il);
|
7967
|
-
|
7968
|
-
struct ggml_tensor * tmpq = ggml_view_3d(
|
7969
|
-
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
7970
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
7971
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
7972
|
-
0
|
7973
|
-
);
|
7974
|
-
cb(tmpq, "tmpq", il);
|
7975
|
-
|
7976
|
-
struct ggml_tensor * tmpk = ggml_view_3d(
|
7977
|
-
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
7978
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
7979
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
7980
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
|
7981
|
-
);
|
7982
|
-
cb(tmpk, "tmpk", il);
|
7983
|
-
|
7984
|
-
// Q/K Layernorm
|
7985
|
-
tmpq = llm_build_norm(ctx0, tmpq, hparams,
|
7986
|
-
model.layers[il].attn_q_norm,
|
7987
|
-
model.layers[il].attn_q_norm_b,
|
7988
|
-
LLM_NORM, cb, il);
|
7989
|
-
cb(tmpq, "tmpq", il);
|
7990
|
-
|
7991
|
-
tmpk = llm_build_norm(ctx0, tmpk, hparams,
|
7992
|
-
model.layers[il].attn_k_norm,
|
7993
|
-
model.layers[il].attn_k_norm_b,
|
7994
|
-
LLM_NORM, cb, il);
|
7995
|
-
cb(tmpk, "tmpk", il);
|
7996
|
-
|
7997
|
-
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
7998
|
-
struct ggml_tensor * qrot = ggml_view_3d(
|
7999
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
8000
|
-
ggml_element_size(tmpq) * n_embd_head,
|
8001
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
8002
|
-
0
|
8003
|
-
);
|
8004
|
-
cb(qrot, "qrot", il);
|
8005
|
-
|
8006
|
-
struct ggml_tensor * krot = ggml_view_3d(
|
8007
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
8008
|
-
ggml_element_size(tmpk) * n_embd_head,
|
8009
|
-
ggml_element_size(tmpk) * n_embd_head * n_head,
|
8010
|
-
0
|
8011
|
-
);
|
8012
|
-
cb(krot, "krot", il);
|
8013
|
-
|
8014
|
-
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
8015
|
-
struct ggml_tensor * qpass = ggml_view_3d(
|
8016
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
8017
|
-
ggml_element_size(tmpq) * n_embd_head,
|
8018
|
-
ggml_element_size(tmpq) * n_embd_head * n_head,
|
8019
|
-
ggml_element_size(tmpq) * n_rot
|
8020
|
-
);
|
8021
|
-
cb(qpass, "qpass", il);
|
8022
|
-
|
8023
|
-
struct ggml_tensor * kpass = ggml_view_3d(
|
8024
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
8025
|
-
ggml_element_size(tmpk) * n_embd_head,
|
8026
|
-
ggml_element_size(tmpk) * n_embd_head * n_head,
|
8027
|
-
ggml_element_size(tmpk) * n_rot
|
8028
|
-
);
|
8029
|
-
cb(kpass, "kpass", il);
|
8030
|
-
|
8031
|
-
struct ggml_tensor * qrotated = ggml_rope_custom(
|
8032
|
-
ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8033
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8034
|
-
);
|
8035
|
-
cb(qrotated, "qrotated", il);
|
8036
|
-
|
8037
|
-
struct ggml_tensor * krotated = ggml_rope_custom(
|
8038
|
-
ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8039
|
-
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8040
|
-
);
|
8041
|
-
cb(krotated, "krotated", il);
|
8042
|
-
|
8043
|
-
// ggml currently only supports concatenation on dim=2
|
8044
|
-
// so we need to permute qrot, qpass, concat, then permute back.
|
8045
|
-
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
8046
|
-
cb(qrotated, "qrotated", il);
|
8047
|
-
|
8048
|
-
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
8049
|
-
cb(krotated, "krotated", il);
|
8050
|
-
|
8051
|
-
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
8052
|
-
cb(qpass, "qpass", il);
|
8053
|
-
|
8054
|
-
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
8055
|
-
cb(kpass, "kpass", il);
|
8056
|
-
|
8057
|
-
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
8058
|
-
cb(Qcur, "Qcur", il);
|
8059
|
-
|
8060
|
-
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
8061
|
-
cb(Kcur, "Kcur", il);
|
8062
|
-
|
8063
|
-
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
|
8064
|
-
cb(Q, "Q", il);
|
8065
|
-
|
8066
|
-
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
8067
|
-
cb(Kcur, "Kcur", il);
|
8068
|
-
|
8069
|
-
struct ggml_tensor * Vcur = ggml_view_3d(
|
8070
|
-
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
8071
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
8072
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
8073
|
-
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
|
8074
|
-
);
|
8075
|
-
cb(Vcur, "Vcur", il);
|
8076
|
-
|
8077
|
-
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8078
|
-
model.layers[il].wo, model.layers[il].bo,
|
8079
|
-
Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8080
|
-
}
|
8081
|
-
|
8082
|
-
if (il == n_layer - 1) {
|
8083
|
-
// skip computing output for unused tokens
|
8084
|
-
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8085
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8086
|
-
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
8087
|
-
}
|
8088
|
-
|
8089
|
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
8090
|
-
cb(ffn_inp, "ffn_inp", il);
|
8091
|
-
|
8092
|
-
// feed-forward network
|
8093
|
-
{
|
8094
|
-
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8095
|
-
model.layers[il].ffn_norm,
|
8096
|
-
model.layers[il].ffn_norm_b,
|
8097
|
-
LLM_NORM, cb, il);
|
8098
|
-
cb(cur, "ffn_norm", il);
|
8099
|
-
|
8100
|
-
cur = llm_build_ffn(ctx0, cur,
|
8101
|
-
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
8102
|
-
NULL, NULL,
|
8103
|
-
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8104
|
-
NULL,
|
8105
|
-
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
|
8106
|
-
cb(cur, "ffn_out", il);
|
8107
|
-
}
|
8108
|
-
|
8109
|
-
cur = ggml_add(ctx0, cur, ffn_inp);
|
8110
|
-
cb(cur, "l_out", il);
|
8111
|
-
|
8112
|
-
inpL = cur;
|
8113
|
-
}
|
8114
|
-
|
8115
|
-
cur = inpL;
|
8116
|
-
|
8117
|
-
cur = llm_build_norm(ctx0, cur, hparams,
|
8118
|
-
model.output_norm,
|
8119
|
-
model.output_norm_b,
|
8120
|
-
LLM_NORM, cb, -1);
|
8121
|
-
cb(cur, "result_norm", -1);
|
8122
|
-
|
8123
|
-
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8124
|
-
cb(cur, "result_output", -1);
|
8125
|
-
|
8126
|
-
ggml_build_forward_expand(gf, cur);
|
8127
|
-
|
8128
|
-
return gf;
|
8129
|
-
}
|
8130
|
-
|
8131
8223
|
struct ggml_cgraph * build_refact() {
|
8132
8224
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8133
8225
|
|
@@ -8304,15 +8396,15 @@ struct llm_build_context {
|
|
8304
8396
|
cb(Kcur, "Kcur", il);
|
8305
8397
|
cb(Vcur, "Vcur", il);
|
8306
8398
|
|
8307
|
-
Qcur =
|
8308
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8399
|
+
Qcur = ggml_rope_ext(
|
8400
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8309
8401
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8310
8402
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8311
8403
|
);
|
8312
8404
|
cb(Qcur, "Qcur", il);
|
8313
8405
|
|
8314
|
-
Kcur =
|
8315
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8406
|
+
Kcur = ggml_rope_ext(
|
8407
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8316
8408
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8317
8409
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8318
8410
|
);
|
@@ -8744,15 +8836,15 @@ struct llm_build_context {
|
|
8744
8836
|
}
|
8745
8837
|
|
8746
8838
|
|
8747
|
-
Qcur =
|
8748
|
-
ctx0, Qcur, inp_pos,
|
8839
|
+
Qcur = ggml_rope_ext(
|
8840
|
+
ctx0, Qcur, inp_pos, nullptr,
|
8749
8841
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8750
8842
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8751
8843
|
);
|
8752
8844
|
cb(Qcur, "Qcur", il);
|
8753
8845
|
|
8754
|
-
Kcur =
|
8755
|
-
ctx0, Kcur, inp_pos,
|
8846
|
+
Kcur = ggml_rope_ext(
|
8847
|
+
ctx0, Kcur, inp_pos, nullptr,
|
8756
8848
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8757
8849
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8758
8850
|
);
|
@@ -8864,14 +8956,14 @@ struct llm_build_context {
|
|
8864
8956
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8865
8957
|
|
8866
8958
|
// using mode = 2 for neox mode
|
8867
|
-
Qcur =
|
8868
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8959
|
+
Qcur = ggml_rope_ext(
|
8960
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
8869
8961
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8870
8962
|
);
|
8871
8963
|
cb(Qcur, "Qcur", il);
|
8872
8964
|
|
8873
|
-
Kcur =
|
8874
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
8965
|
+
Kcur = ggml_rope_ext(
|
8966
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
8875
8967
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8876
8968
|
);
|
8877
8969
|
cb(Kcur, "Kcur", il);
|
@@ -8975,15 +9067,15 @@ struct llm_build_context {
|
|
8975
9067
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8976
9068
|
cb(Vcur, "Vcur", il);
|
8977
9069
|
|
8978
|
-
Qcur =
|
8979
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9070
|
+
Qcur = ggml_rope_ext(
|
9071
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8980
9072
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8981
9073
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8982
9074
|
);
|
8983
9075
|
cb(Qcur, "Qcur", il);
|
8984
9076
|
|
8985
|
-
Kcur =
|
8986
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9077
|
+
Kcur = ggml_rope_ext(
|
9078
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8987
9079
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8988
9080
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8989
9081
|
);
|
@@ -9089,15 +9181,15 @@ struct llm_build_context {
|
|
9089
9181
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
9090
9182
|
cb(Vcur, "Vcur", il);
|
9091
9183
|
|
9092
|
-
Qcur =
|
9093
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9184
|
+
Qcur = ggml_rope_ext(
|
9185
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9094
9186
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9095
9187
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9096
9188
|
);
|
9097
9189
|
cb(Qcur, "Qcur", il);
|
9098
9190
|
|
9099
|
-
Kcur =
|
9100
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9191
|
+
Kcur = ggml_rope_ext(
|
9192
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9101
9193
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9102
9194
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9103
9195
|
);
|
@@ -9133,6 +9225,7 @@ struct llm_build_context {
|
|
9133
9225
|
model.layers[il].ffn_down_exps,
|
9134
9226
|
n_expert, n_expert_used,
|
9135
9227
|
LLM_FFN_SILU, false,
|
9228
|
+
false, 0.0,
|
9136
9229
|
cb, il);
|
9137
9230
|
cb(cur, "ffn_moe_out", il);
|
9138
9231
|
|
@@ -9241,8 +9334,8 @@ struct llm_build_context {
|
|
9241
9334
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9242
9335
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9243
9336
|
|
9244
|
-
Qcur =
|
9245
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9337
|
+
Qcur = ggml_rope_ext(
|
9338
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9246
9339
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9247
9340
|
);
|
9248
9341
|
cb(Qcur, "Qcur", il);
|
@@ -9252,8 +9345,8 @@ struct llm_build_context {
|
|
9252
9345
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
9253
9346
|
cb(Qcur, "Qcur", il);
|
9254
9347
|
|
9255
|
-
Kcur =
|
9256
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9348
|
+
Kcur = ggml_rope_ext(
|
9349
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
9257
9350
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9258
9351
|
);
|
9259
9352
|
cb(Kcur, "Kcur", il);
|
@@ -9329,6 +9422,9 @@ struct llm_build_context {
|
|
9329
9422
|
|
9330
9423
|
// self-attention
|
9331
9424
|
{
|
9425
|
+
// rope freq factors for 128k context
|
9426
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
9427
|
+
|
9332
9428
|
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
9333
9429
|
model.layers[il].attn_norm,
|
9334
9430
|
NULL,
|
@@ -9360,8 +9456,8 @@ struct llm_build_context {
|
|
9360
9456
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9361
9457
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9362
9458
|
|
9363
|
-
Qcur =
|
9364
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9459
|
+
Qcur = ggml_rope_ext(
|
9460
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
9365
9461
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9366
9462
|
);
|
9367
9463
|
cb(Qcur, "Qcur", il);
|
@@ -9369,8 +9465,8 @@ struct llm_build_context {
|
|
9369
9465
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
9370
9466
|
cb(Qcur, "Qcur", il);
|
9371
9467
|
|
9372
|
-
Kcur =
|
9373
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9468
|
+
Kcur = ggml_rope_ext(
|
9469
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
9374
9470
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9375
9471
|
);
|
9376
9472
|
cb(Kcur, "Kcur", il);
|
@@ -9476,14 +9572,14 @@ struct llm_build_context {
|
|
9476
9572
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
9477
9573
|
cb(Vcur, "Vcur", il);
|
9478
9574
|
|
9479
|
-
Qcur =
|
9480
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
|
9575
|
+
Qcur = ggml_rope_ext(
|
9576
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
9481
9577
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9482
9578
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9483
9579
|
cb(Qcur, "Qcur", il);
|
9484
9580
|
|
9485
|
-
Kcur =
|
9486
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
|
9581
|
+
Kcur = ggml_rope_ext(
|
9582
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
9487
9583
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9488
9584
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9489
9585
|
cb(Kcur, "Kcur", il);
|
@@ -9684,15 +9780,15 @@ struct llm_build_context {
|
|
9684
9780
|
cb(tmpk, "tmpk", il);
|
9685
9781
|
cb(Vcur, "Vcur", il);
|
9686
9782
|
|
9687
|
-
struct ggml_tensor * Qcur =
|
9688
|
-
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
9783
|
+
struct ggml_tensor * Qcur = ggml_rope_ext(
|
9784
|
+
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9689
9785
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9690
9786
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9691
9787
|
);
|
9692
9788
|
cb(Qcur, "Qcur", il);
|
9693
9789
|
|
9694
|
-
struct ggml_tensor * Kcur = ggml_rope_custom(
|
9695
|
-
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9790
|
+
struct ggml_tensor * Kcur = ggml_rope_ext(
|
9791
|
+
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9696
9792
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9697
9793
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9698
9794
|
);
|
@@ -9800,15 +9896,15 @@ struct llm_build_context {
|
|
9800
9896
|
// cb(Vcur, "Vcur", il);
|
9801
9897
|
// }
|
9802
9898
|
|
9803
|
-
Qcur = ggml_rope_custom(
|
9804
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9899
|
+
Qcur = ggml_rope_ext(
|
9900
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9805
9901
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9806
9902
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9807
9903
|
);
|
9808
9904
|
cb(Qcur, "Qcur", il);
|
9809
9905
|
|
9810
|
-
Kcur = ggml_rope_custom(
|
9811
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9906
|
+
Kcur = ggml_rope_ext(
|
9907
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9812
9908
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9813
9909
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9814
9910
|
);
|
@@ -9917,15 +10013,15 @@ struct llm_build_context {
|
|
9917
10013
|
cb(Vcur, "Vcur", il);
|
9918
10014
|
}
|
9919
10015
|
|
9920
|
-
Qcur = ggml_rope_custom(
|
9921
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10016
|
+
Qcur = ggml_rope_ext(
|
10017
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9922
10018
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9923
10019
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9924
10020
|
);
|
9925
10021
|
cb(Qcur, "Qcur", il);
|
9926
10022
|
|
9927
|
-
Kcur = ggml_rope_custom(
|
9928
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10023
|
+
Kcur = ggml_rope_ext(
|
10024
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9929
10025
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9930
10026
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9931
10027
|
);
|
@@ -10047,15 +10143,15 @@ struct llm_build_context {
|
|
10047
10143
|
cb(Vcur, "Vcur", il);
|
10048
10144
|
}
|
10049
10145
|
|
10050
|
-
Qcur = ggml_rope_custom(
|
10051
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10146
|
+
Qcur = ggml_rope_ext(
|
10147
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10052
10148
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10053
10149
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10054
10150
|
);
|
10055
10151
|
cb(Qcur, "Qcur", il);
|
10056
10152
|
|
10057
|
-
Kcur = ggml_rope_custom(
|
10058
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10153
|
+
Kcur = ggml_rope_ext(
|
10154
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10059
10155
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10060
10156
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10061
10157
|
);
|
@@ -10167,8 +10263,8 @@ struct llm_build_context {
|
|
10167
10263
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10168
10264
|
cb(Vcur, "Vcur", il);
|
10169
10265
|
|
10170
|
-
Qcur = ggml_rope_custom(
|
10171
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
10266
|
+
Qcur = ggml_rope_ext(
|
10267
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
10172
10268
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10173
10269
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10174
10270
|
cb(Qcur, "Qcur", il);
|
@@ -10176,8 +10272,8 @@ struct llm_build_context {
|
|
10176
10272
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
10177
10273
|
cb(Qcur, "Qcur_scaled", il);
|
10178
10274
|
|
10179
|
-
Kcur = ggml_rope_custom(
|
10180
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
10275
|
+
Kcur = ggml_rope_ext(
|
10276
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
10181
10277
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10182
10278
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10183
10279
|
cb(Kcur, "Kcur", il);
|
@@ -10287,15 +10383,15 @@ struct llm_build_context {
|
|
10287
10383
|
cb(Vcur, "Vcur", il);
|
10288
10384
|
}
|
10289
10385
|
|
10290
|
-
Qcur = ggml_rope_custom(
|
10291
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10386
|
+
Qcur = ggml_rope_ext(
|
10387
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10292
10388
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10293
10389
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10294
10390
|
);
|
10295
10391
|
cb(Qcur, "Qcur", il);
|
10296
10392
|
|
10297
|
-
Kcur = ggml_rope_custom(
|
10298
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10393
|
+
Kcur = ggml_rope_ext(
|
10394
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10299
10395
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10300
10396
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10301
10397
|
);
|
@@ -10577,15 +10673,15 @@ struct llm_build_context {
|
|
10577
10673
|
cb(Kcur, "Kcur", il);
|
10578
10674
|
}
|
10579
10675
|
|
10580
|
-
Qcur = ggml_rope_custom(
|
10581
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10676
|
+
Qcur = ggml_rope_ext(
|
10677
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10582
10678
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10583
10679
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10584
10680
|
);
|
10585
10681
|
cb(Qcur, "Qcur", il);
|
10586
10682
|
|
10587
|
-
Kcur = ggml_rope_custom(
|
10588
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10683
|
+
Kcur = ggml_rope_ext(
|
10684
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10589
10685
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10590
10686
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10591
10687
|
);
|
@@ -10680,8 +10776,269 @@ struct llm_build_context {
|
|
10680
10776
|
|
10681
10777
|
// norm
|
10682
10778
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
10683
|
-
NULL, NULL,
|
10684
|
-
LLM_NORM, cb, il);
|
10779
|
+
NULL, NULL,
|
10780
|
+
LLM_NORM, cb, il);
|
10781
|
+
cb(cur, "attn_norm", il);
|
10782
|
+
|
10783
|
+
// self-attention
|
10784
|
+
{
|
10785
|
+
// compute Q and K and RoPE them
|
10786
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10787
|
+
cb(Qcur, "Qcur", il);
|
10788
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10789
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10790
|
+
cb(Qcur, "Qcur", il);
|
10791
|
+
}
|
10792
|
+
|
10793
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10794
|
+
cb(Kcur, "Kcur", il);
|
10795
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10796
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10797
|
+
cb(Kcur, "Kcur", il);
|
10798
|
+
}
|
10799
|
+
|
10800
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10801
|
+
cb(Vcur, "Vcur", il);
|
10802
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10803
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10804
|
+
cb(Vcur, "Vcur", il);
|
10805
|
+
}
|
10806
|
+
|
10807
|
+
Qcur = ggml_rope_ext(
|
10808
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10809
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10810
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10811
|
+
);
|
10812
|
+
cb(Qcur, "Qcur", il);
|
10813
|
+
|
10814
|
+
Kcur = ggml_rope_ext(
|
10815
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10816
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10817
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10818
|
+
);
|
10819
|
+
cb(Kcur, "Kcur", il);
|
10820
|
+
|
10821
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10822
|
+
model.layers[il].wo, nullptr,
|
10823
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10824
|
+
}
|
10825
|
+
|
10826
|
+
if (il == n_layer - 1) {
|
10827
|
+
// skip computing output for unused tokens
|
10828
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10829
|
+
n_tokens = n_outputs;
|
10830
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10831
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
10832
|
+
}
|
10833
|
+
|
10834
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
10835
|
+
cb(ffn_inp, "ffn_inp", il);
|
10836
|
+
|
10837
|
+
// feed-forward network
|
10838
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10839
|
+
NULL, NULL,
|
10840
|
+
LLM_NORM, cb, il);
|
10841
|
+
cb(cur, "ffn_norm", il);
|
10842
|
+
|
10843
|
+
cur = llm_build_ffn(ctx0, cur,
|
10844
|
+
model.layers[il].ffn_up, NULL,
|
10845
|
+
model.layers[il].ffn_gate, NULL,
|
10846
|
+
model.layers[il].ffn_down, NULL,
|
10847
|
+
NULL,
|
10848
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10849
|
+
cb(cur, "ffn_out", il);
|
10850
|
+
|
10851
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
10852
|
+
cb(cur, "ffn_out", il);
|
10853
|
+
|
10854
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
10855
|
+
if (layer_dir != nullptr) {
|
10856
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
10857
|
+
}
|
10858
|
+
cb(cur, "l_out", il);
|
10859
|
+
|
10860
|
+
// input for next layer
|
10861
|
+
inpL = cur;
|
10862
|
+
}
|
10863
|
+
|
10864
|
+
cur = inpL;
|
10865
|
+
|
10866
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10867
|
+
NULL, NULL,
|
10868
|
+
LLM_NORM, cb, -1);
|
10869
|
+
cb(cur, "result_norm", -1);
|
10870
|
+
|
10871
|
+
// lm_head
|
10872
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10873
|
+
cb(cur, "result_output", -1);
|
10874
|
+
|
10875
|
+
ggml_build_forward_expand(gf, cur);
|
10876
|
+
|
10877
|
+
return gf;
|
10878
|
+
}
|
10879
|
+
|
10880
|
+
struct ggml_cgraph * build_gptneox() {
|
10881
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10882
|
+
|
10883
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10884
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
10885
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10886
|
+
|
10887
|
+
struct ggml_tensor * cur;
|
10888
|
+
struct ggml_tensor * inpL;
|
10889
|
+
|
10890
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10891
|
+
|
10892
|
+
// inp_pos - contains the positions
|
10893
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10894
|
+
|
10895
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10896
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10897
|
+
|
10898
|
+
for (int il = 0; il < n_layer; ++il) {
|
10899
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10900
|
+
model.layers[il].attn_norm,
|
10901
|
+
model.layers[il].attn_norm_b,
|
10902
|
+
LLM_NORM, cb, il);
|
10903
|
+
cb(cur, "attn_norm", il);
|
10904
|
+
|
10905
|
+
// self-attention
|
10906
|
+
{
|
10907
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
10908
|
+
cb(cur, "wqkv", il);
|
10909
|
+
|
10910
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
10911
|
+
cb(cur, "bqkv", il);
|
10912
|
+
|
10913
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
10914
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
10915
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
10916
|
+
|
10917
|
+
cb(Qcur, "Qcur", il);
|
10918
|
+
cb(Kcur, "Kcur", il);
|
10919
|
+
cb(Vcur, "Vcur", il);
|
10920
|
+
|
10921
|
+
Qcur = ggml_rope_ext(
|
10922
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10923
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10924
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10925
|
+
);
|
10926
|
+
cb(Qcur, "Qcur", il);
|
10927
|
+
|
10928
|
+
Kcur = ggml_rope_ext(
|
10929
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10930
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10931
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10932
|
+
);
|
10933
|
+
cb(Kcur, "Kcur", il);
|
10934
|
+
|
10935
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10936
|
+
model.layers[il].wo, model.layers[il].bo,
|
10937
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10938
|
+
}
|
10939
|
+
|
10940
|
+
if (il == n_layer - 1) {
|
10941
|
+
// skip computing output for unused tokens
|
10942
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10943
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10944
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
10945
|
+
}
|
10946
|
+
|
10947
|
+
// ffn
|
10948
|
+
if (hparams.use_par_res) {
|
10949
|
+
// attention and ffn are computed in parallel
|
10950
|
+
// x = x + attn(ln1(x)) + ffn(ln2(x))
|
10951
|
+
|
10952
|
+
struct ggml_tensor * attn_out = cur;
|
10953
|
+
|
10954
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10955
|
+
model.layers[il].ffn_norm,
|
10956
|
+
model.layers[il].ffn_norm_b,
|
10957
|
+
LLM_NORM, cb, il);
|
10958
|
+
cb(cur, "ffn_norm", il);
|
10959
|
+
|
10960
|
+
cur = llm_build_ffn(ctx0, cur,
|
10961
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
10962
|
+
NULL, NULL,
|
10963
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
10964
|
+
NULL,
|
10965
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
10966
|
+
cb(cur, "ffn_out", il);
|
10967
|
+
|
10968
|
+
cur = ggml_add(ctx0, cur, inpL);
|
10969
|
+
cb(cur, "ffn_out", il);
|
10970
|
+
|
10971
|
+
inpL = ggml_add(ctx0, cur, attn_out);
|
10972
|
+
cb(inpL, "l_out", il);
|
10973
|
+
} else {
|
10974
|
+
// attention and ffn are computed sequentially
|
10975
|
+
// x = x + attn(ln1(x))
|
10976
|
+
// x = x + ffn(ln2(x))
|
10977
|
+
|
10978
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
10979
|
+
cb(ffn_inp, "ffn_inp", il);
|
10980
|
+
|
10981
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10982
|
+
model.layers[il].ffn_norm,
|
10983
|
+
model.layers[il].ffn_norm_b,
|
10984
|
+
LLM_NORM, cb, il);
|
10985
|
+
cb(cur, "ffn_norm", il);
|
10986
|
+
|
10987
|
+
cur = llm_build_ffn(ctx0, cur,
|
10988
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
10989
|
+
NULL, NULL,
|
10990
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
10991
|
+
NULL,
|
10992
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
10993
|
+
cb(cur, "ffn_out", il);
|
10994
|
+
|
10995
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
10996
|
+
cb(inpL, "l_out", il);
|
10997
|
+
}
|
10998
|
+
}
|
10999
|
+
|
11000
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11001
|
+
model.output_norm,
|
11002
|
+
model.output_norm_b,
|
11003
|
+
LLM_NORM, cb, -1);
|
11004
|
+
cb(cur, "result_norm", -1);
|
11005
|
+
|
11006
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
11007
|
+
cb(cur, "result_output", -1);
|
11008
|
+
|
11009
|
+
ggml_build_forward_expand(gf, cur);
|
11010
|
+
|
11011
|
+
return gf;
|
11012
|
+
}
|
11013
|
+
|
11014
|
+
struct ggml_cgraph * build_arctic() {
|
11015
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
11016
|
+
|
11017
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
11018
|
+
int32_t n_tokens = this->n_tokens;
|
11019
|
+
|
11020
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
11021
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
11022
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
11023
|
+
|
11024
|
+
struct ggml_tensor * cur;
|
11025
|
+
struct ggml_tensor * inpL;
|
11026
|
+
|
11027
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
11028
|
+
|
11029
|
+
// inp_pos - contains the positions
|
11030
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
11031
|
+
|
11032
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
11033
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
11034
|
+
|
11035
|
+
for (int il = 0; il < n_layer; ++il) {
|
11036
|
+
struct ggml_tensor * inpSA = inpL;
|
11037
|
+
|
11038
|
+
// norm
|
11039
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11040
|
+
model.layers[il].attn_norm, NULL,
|
11041
|
+
LLM_NORM_RMS, cb, il);
|
10685
11042
|
cb(cur, "attn_norm", il);
|
10686
11043
|
|
10687
11044
|
// self-attention
|
@@ -10689,41 +11046,29 @@ struct llm_build_context {
|
|
10689
11046
|
// compute Q and K and RoPE them
|
10690
11047
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10691
11048
|
cb(Qcur, "Qcur", il);
|
10692
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10693
|
-
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10694
|
-
cb(Qcur, "Qcur", il);
|
10695
|
-
}
|
10696
11049
|
|
10697
11050
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10698
11051
|
cb(Kcur, "Kcur", il);
|
10699
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10700
|
-
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10701
|
-
cb(Kcur, "Kcur", il);
|
10702
|
-
}
|
10703
11052
|
|
10704
11053
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10705
11054
|
cb(Vcur, "Vcur", il);
|
10706
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
10707
|
-
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10708
|
-
cb(Vcur, "Vcur", il);
|
10709
|
-
}
|
10710
11055
|
|
10711
|
-
Qcur = ggml_rope_custom(
|
10712
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
11056
|
+
Qcur = ggml_rope_ext(
|
11057
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10713
11058
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10714
11059
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10715
11060
|
);
|
10716
11061
|
cb(Qcur, "Qcur", il);
|
10717
11062
|
|
10718
|
-
Kcur = ggml_rope_custom(
|
10719
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
11063
|
+
Kcur = ggml_rope_ext(
|
11064
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10720
11065
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10721
11066
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10722
11067
|
);
|
10723
11068
|
cb(Kcur, "Kcur", il);
|
10724
11069
|
|
10725
11070
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10726
|
-
model.layers[il].wo, nullptr,
|
11071
|
+
model.layers[il].wo, NULL,
|
10727
11072
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10728
11073
|
}
|
10729
11074
|
|
@@ -10740,8 +11085,8 @@ struct llm_build_context {
|
|
10740
11085
|
|
10741
11086
|
// feed-forward network
|
10742
11087
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10743
|
-
|
10744
|
-
|
11088
|
+
model.layers[il].ffn_norm, NULL,
|
11089
|
+
LLM_NORM_RMS, cb, il);
|
10745
11090
|
cb(cur, "ffn_norm", il);
|
10746
11091
|
|
10747
11092
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -10752,7 +11097,27 @@ struct llm_build_context {
|
|
10752
11097
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10753
11098
|
cb(cur, "ffn_out", il);
|
10754
11099
|
|
10755
|
-
|
11100
|
+
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
11101
|
+
cb(ffn_out, "ffn_out", il);
|
11102
|
+
|
11103
|
+
// MoE
|
11104
|
+
cur = llm_build_norm(ctx0, inpSA, hparams,
|
11105
|
+
model.layers[il].ffn_norm_exps, NULL,
|
11106
|
+
LLM_NORM_RMS, cb, il);
|
11107
|
+
cb(cur, "ffn_norm_exps", il);
|
11108
|
+
|
11109
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
11110
|
+
model.layers[il].ffn_gate_inp,
|
11111
|
+
model.layers[il].ffn_up_exps,
|
11112
|
+
model.layers[il].ffn_gate_exps,
|
11113
|
+
model.layers[il].ffn_down_exps,
|
11114
|
+
n_expert, n_expert_used,
|
11115
|
+
LLM_FFN_SILU, true,
|
11116
|
+
false, 0.0,
|
11117
|
+
cb, il);
|
11118
|
+
cb(cur, "ffn_moe_out", il);
|
11119
|
+
|
11120
|
+
cur = ggml_add(ctx0, cur, ffn_out);
|
10756
11121
|
cb(cur, "ffn_out", il);
|
10757
11122
|
|
10758
11123
|
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
@@ -10768,8 +11133,240 @@ struct llm_build_context {
|
|
10768
11133
|
cur = inpL;
|
10769
11134
|
|
10770
11135
|
cur = llm_build_norm(ctx0, cur, hparams,
|
10771
|
-
|
10772
|
-
|
11136
|
+
model.output_norm, NULL,
|
11137
|
+
LLM_NORM_RMS, cb, -1);
|
11138
|
+
cb(cur, "result_norm", -1);
|
11139
|
+
|
11140
|
+
// lm_head
|
11141
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
11142
|
+
cb(cur, "result_output", -1);
|
11143
|
+
|
11144
|
+
ggml_build_forward_expand(gf, cur);
|
11145
|
+
|
11146
|
+
return gf;
|
11147
|
+
}
|
11148
|
+
|
11149
|
+
struct ggml_cgraph * build_deepseek2() {
|
11150
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
11151
|
+
|
11152
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
11153
|
+
int32_t n_tokens = this->n_tokens;
|
11154
|
+
|
11155
|
+
bool is_lite = (hparams.n_layer == 27);
|
11156
|
+
|
11157
|
+
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
11158
|
+
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
11159
|
+
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
11160
|
+
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
|
11161
|
+
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
11162
|
+
|
11163
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
11164
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
11165
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
11166
|
+
|
11167
|
+
struct ggml_tensor * cur;
|
11168
|
+
struct ggml_tensor * inpL;
|
11169
|
+
|
11170
|
+
// {n_embd, n_tokens}
|
11171
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
11172
|
+
|
11173
|
+
// inp_pos - contains the positions
|
11174
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
11175
|
+
|
11176
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
11177
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
11178
|
+
|
11179
|
+
for (int il = 0; il < n_layer; ++il) {
|
11180
|
+
struct ggml_tensor * inpSA = inpL;
|
11181
|
+
|
11182
|
+
// norm
|
11183
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11184
|
+
model.layers[il].attn_norm, NULL,
|
11185
|
+
LLM_NORM_RMS, cb, il);
|
11186
|
+
cb(cur, "attn_norm", il);
|
11187
|
+
|
11188
|
+
// self_attention
|
11189
|
+
{
|
11190
|
+
struct ggml_tensor * q = NULL;
|
11191
|
+
if (!is_lite) {
|
11192
|
+
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
|
11193
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
11194
|
+
cb(q, "q", il);
|
11195
|
+
|
11196
|
+
q = llm_build_norm(ctx0, q, hparams,
|
11197
|
+
model.layers[il].attn_q_a_norm, NULL,
|
11198
|
+
LLM_NORM_RMS, cb, il);
|
11199
|
+
cb(q, "q", il);
|
11200
|
+
|
11201
|
+
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
11202
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
11203
|
+
cb(q, "q", il);
|
11204
|
+
} else {
|
11205
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
11206
|
+
cb(q, "q", il);
|
11207
|
+
}
|
11208
|
+
|
11209
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
11210
|
+
struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
11211
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
11212
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
11213
|
+
0);
|
11214
|
+
cb(q_nope, "q_nope", il);
|
11215
|
+
|
11216
|
+
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
11217
|
+
struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
11218
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
11219
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
11220
|
+
ggml_row_size(q->type, n_embd_head_qk_nope));
|
11221
|
+
cb(q_pe, "q_pe", il);
|
11222
|
+
|
11223
|
+
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
|
11224
|
+
struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
11225
|
+
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
11226
|
+
|
11227
|
+
// split into {kv_lora_rank, n_tokens}
|
11228
|
+
struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
|
11229
|
+
kv_pe_compresseed->nb[1],
|
11230
|
+
0);
|
11231
|
+
cb(kv_compressed, "kv_compressed", il);
|
11232
|
+
|
11233
|
+
// and {n_embd_head_qk_rope, n_tokens}
|
11234
|
+
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
11235
|
+
kv_pe_compresseed->nb[1],
|
11236
|
+
kv_pe_compresseed->nb[1],
|
11237
|
+
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
11238
|
+
cb(k_pe, "k_pe", il);
|
11239
|
+
|
11240
|
+
kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
|
11241
|
+
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
|
11242
|
+
model.layers[il].attn_kv_a_norm, NULL,
|
11243
|
+
LLM_NORM_RMS, cb, il);
|
11244
|
+
cb(kv_compressed, "kv_compressed", il);
|
11245
|
+
|
11246
|
+
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
|
11247
|
+
struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
|
11248
|
+
cb(kv, "kv", il);
|
11249
|
+
|
11250
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
11251
|
+
struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
11252
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
11253
|
+
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
11254
|
+
0);
|
11255
|
+
cb(k_nope, "k_nope", il);
|
11256
|
+
|
11257
|
+
// and {n_head * n_embd_head_v, n_tokens}
|
11258
|
+
struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
11259
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
11260
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
11261
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
11262
|
+
cb(v_states, "v_states", il);
|
11263
|
+
|
11264
|
+
v_states = ggml_cont(ctx0, v_states);
|
11265
|
+
cb(v_states, "v_states", il);
|
11266
|
+
|
11267
|
+
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
11268
|
+
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
11269
|
+
0);
|
11270
|
+
cb(v_states, "v_states", il);
|
11271
|
+
|
11272
|
+
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11273
|
+
q_pe = ggml_rope_ext(
|
11274
|
+
ctx0, q_pe, inp_pos, nullptr,
|
11275
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
11276
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11277
|
+
);
|
11278
|
+
cb(q_pe, "q_pe", il);
|
11279
|
+
|
11280
|
+
// shared RoPE key
|
11281
|
+
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11282
|
+
k_pe = ggml_rope_ext(
|
11283
|
+
ctx0, k_pe, inp_pos, nullptr,
|
11284
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
11285
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11286
|
+
);
|
11287
|
+
cb(k_pe, "k_pe", il);
|
11288
|
+
|
11289
|
+
struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
|
11290
|
+
cb(q_states, "q_states", il);
|
11291
|
+
|
11292
|
+
struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
11293
|
+
cb(k_states, "k_states", il);
|
11294
|
+
|
11295
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
11296
|
+
model.layers[il].wo, NULL,
|
11297
|
+
k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
11298
|
+
}
|
11299
|
+
|
11300
|
+
if (il == n_layer - 1) {
|
11301
|
+
// skip computing output for unused tokens
|
11302
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
11303
|
+
n_tokens = n_outputs;
|
11304
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
11305
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
11306
|
+
}
|
11307
|
+
|
11308
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
11309
|
+
cb(ffn_inp, "ffn_inp", il);
|
11310
|
+
|
11311
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
11312
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
11313
|
+
model.layers[il].ffn_norm, NULL,
|
11314
|
+
LLM_NORM_RMS, cb, il);
|
11315
|
+
cb(cur, "ffn_norm", il);
|
11316
|
+
|
11317
|
+
cur = llm_build_ffn(ctx0, cur,
|
11318
|
+
model.layers[il].ffn_up, NULL,
|
11319
|
+
model.layers[il].ffn_gate, NULL,
|
11320
|
+
model.layers[il].ffn_down, NULL,
|
11321
|
+
NULL,
|
11322
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
11323
|
+
cb(cur, "ffn_out", il);
|
11324
|
+
} else {
|
11325
|
+
// MoE branch
|
11326
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
11327
|
+
model.layers[il].ffn_norm, NULL,
|
11328
|
+
LLM_NORM_RMS, cb, il);
|
11329
|
+
cb(cur, "ffn_norm", il);
|
11330
|
+
|
11331
|
+
ggml_tensor * moe_out =
|
11332
|
+
llm_build_moe_ffn(ctx0, cur,
|
11333
|
+
model.layers[il].ffn_gate_inp,
|
11334
|
+
model.layers[il].ffn_up_exps,
|
11335
|
+
model.layers[il].ffn_gate_exps,
|
11336
|
+
model.layers[il].ffn_down_exps,
|
11337
|
+
n_expert, n_expert_used,
|
11338
|
+
LLM_FFN_SILU, false,
|
11339
|
+
true, hparams.expert_weights_scale,
|
11340
|
+
cb, il);
|
11341
|
+
cb(moe_out, "ffn_moe_out", il);
|
11342
|
+
|
11343
|
+
// FFN shared expert
|
11344
|
+
{
|
11345
|
+
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
|
11346
|
+
model.layers[il].ffn_up_shexp, NULL,
|
11347
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
11348
|
+
model.layers[il].ffn_down_shexp, NULL,
|
11349
|
+
NULL,
|
11350
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
11351
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
11352
|
+
|
11353
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
11354
|
+
cb(cur, "ffn_out", il);
|
11355
|
+
}
|
11356
|
+
}
|
11357
|
+
|
11358
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
11359
|
+
cb(cur, "l_out", il);
|
11360
|
+
|
11361
|
+
// input for next layer
|
11362
|
+
inpL = cur;
|
11363
|
+
}
|
11364
|
+
|
11365
|
+
cur = inpL;
|
11366
|
+
|
11367
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
11368
|
+
model.output_norm, NULL,
|
11369
|
+
LLM_NORM_RMS, cb, -1);
|
10773
11370
|
cb(cur, "result_norm", -1);
|
10774
11371
|
|
10775
11372
|
// lm_head
|
@@ -10780,6 +11377,7 @@ struct llm_build_context {
|
|
10780
11377
|
|
10781
11378
|
return gf;
|
10782
11379
|
}
|
11380
|
+
|
10783
11381
|
};
|
10784
11382
|
|
10785
11383
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -10896,10 +11494,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10896
11494
|
{
|
10897
11495
|
result = llm.build_starcoder();
|
10898
11496
|
} break;
|
10899
|
-
case LLM_ARCH_PERSIMMON:
|
10900
|
-
{
|
10901
|
-
result = llm.build_persimmon();
|
10902
|
-
} break;
|
10903
11497
|
case LLM_ARCH_REFACT:
|
10904
11498
|
{
|
10905
11499
|
result = llm.build_refact();
|
@@ -10994,6 +11588,18 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10994
11588
|
{
|
10995
11589
|
result = llm.build_olmo();
|
10996
11590
|
} break;
|
11591
|
+
case LLM_ARCH_GPTNEOX:
|
11592
|
+
{
|
11593
|
+
result = llm.build_gptneox();
|
11594
|
+
} break;
|
11595
|
+
case LLM_ARCH_ARCTIC:
|
11596
|
+
{
|
11597
|
+
result = llm.build_arctic();
|
11598
|
+
} break;
|
11599
|
+
case LLM_ARCH_DEEPSEEK2:
|
11600
|
+
{
|
11601
|
+
result = llm.build_deepseek2();
|
11602
|
+
} break;
|
10997
11603
|
default:
|
10998
11604
|
GGML_ASSERT(false);
|
10999
11605
|
}
|
@@ -11339,11 +11945,6 @@ static void llama_graph_compute(
|
|
11339
11945
|
llama_context & lctx,
|
11340
11946
|
ggml_cgraph * gf,
|
11341
11947
|
int n_threads) {
|
11342
|
-
#ifdef GGML_USE_MPI
|
11343
|
-
const int64_t n_layer = lctx.model.hparams.n_layer;
|
11344
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
11345
|
-
#endif
|
11346
|
-
|
11347
11948
|
#ifdef GGML_USE_METAL
|
11348
11949
|
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
11349
11950
|
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
@@ -11358,10 +11959,6 @@ static void llama_graph_compute(
|
|
11358
11959
|
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
11359
11960
|
|
11360
11961
|
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
11361
|
-
|
11362
|
-
#ifdef GGML_USE_MPI
|
11363
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
11364
|
-
#endif
|
11365
11962
|
}
|
11366
11963
|
|
11367
11964
|
// decode a batch of tokens by evaluating the transformer
|
@@ -11399,12 +11996,6 @@ static int llama_decode_internal(
|
|
11399
11996
|
}
|
11400
11997
|
lctx.n_queued_tokens += n_tokens_all;
|
11401
11998
|
|
11402
|
-
#ifdef GGML_USE_MPI
|
11403
|
-
// TODO: needs fix after #3228
|
11404
|
-
GGML_ASSERT(false && "not implemented");
|
11405
|
-
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
11406
|
-
#endif
|
11407
|
-
|
11408
11999
|
auto & kv_self = lctx.kv_self;
|
11409
12000
|
|
11410
12001
|
const int64_t n_embd = hparams.n_embd;
|
@@ -12298,6 +12889,7 @@ struct llm_tokenizer_bpe {
|
|
12298
12889
|
});
|
12299
12890
|
break;
|
12300
12891
|
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
12892
|
+
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
12301
12893
|
word_collection = unicode_regex_split(text, {
|
12302
12894
|
// same as llama3
|
12303
12895
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
@@ -12354,6 +12946,7 @@ struct llm_tokenizer_bpe {
|
|
12354
12946
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12355
12947
|
});
|
12356
12948
|
break;
|
12949
|
+
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
|
12357
12950
|
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
12358
12951
|
word_collection = unicode_regex_split(text, {
|
12359
12952
|
// original regex from tokenizer.json
|
@@ -12519,7 +13112,7 @@ struct llm_tokenizer_wpm {
|
|
12519
13112
|
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
12520
13113
|
|
12521
13114
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
12522
|
-
auto * token_map = &vocab.token_to_id;
|
13115
|
+
const auto & token_map = vocab.token_to_id;
|
12523
13116
|
|
12524
13117
|
// normalize and split by whitespace
|
12525
13118
|
std::vector<std::string> words = preprocess(text);
|
@@ -12534,108 +13127,89 @@ struct llm_tokenizer_wpm {
|
|
12534
13127
|
}
|
12535
13128
|
|
12536
13129
|
// prepend phantom space
|
12537
|
-
std::string word1 = "\xe2\x96\x81" + word;
|
12538
|
-
int n = word1.size();
|
13130
|
+
const std::string word1 = "\xe2\x96\x81" + word;
|
13131
|
+
const int n = word1.size();
|
12539
13132
|
|
12540
|
-
|
12541
|
-
int i = 0;
|
12542
|
-
bool match_any = false;
|
13133
|
+
const size_t current_tokens = output.size();
|
12543
13134
|
|
13135
|
+
// we're at the start of a new word
|
12544
13136
|
// move through character position in word
|
12545
|
-
|
13137
|
+
for (int i = 0; i < n; ++i) {
|
12546
13138
|
// loop through possible match length
|
12547
13139
|
bool match = false;
|
12548
13140
|
for (int j = n; j > i; j--) {
|
12549
|
-
auto it = token_map->find(word1.substr(i, j - i));
|
12550
|
-
if (it != token_map->end()) {
|
13141
|
+
auto it = token_map.find(word1.substr(i, j - i));
|
13142
|
+
if (it != token_map.end()) {
|
12551
13143
|
output.push_back(it->second);
|
12552
13144
|
match = true;
|
12553
|
-
|
12554
|
-
i = j;
|
13145
|
+
i = j - 1;
|
12555
13146
|
break;
|
12556
13147
|
}
|
12557
13148
|
}
|
12558
13149
|
|
12559
|
-
|
12560
|
-
|
12561
|
-
|
13150
|
+
if (!match) { // discard all
|
13151
|
+
output.resize(current_tokens);
|
13152
|
+
break; // and discard next tokens
|
12562
13153
|
}
|
12563
13154
|
}
|
12564
13155
|
|
12565
13156
|
// we didn't find any matches for this word
|
12566
|
-
if (!match_any) {
|
13157
|
+
if (current_tokens == output.size()) {
|
12567
13158
|
output.push_back(vocab.special_unk_id);
|
12568
13159
|
}
|
12569
13160
|
}
|
12570
13161
|
}
|
12571
13162
|
|
12572
13163
|
std::vector<std::string> preprocess(const std::string & text) {
|
12573
|
-
std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
12574
|
-
|
12575
|
-
|
12576
|
-
|
12577
|
-
|
12578
|
-
|
12579
|
-
|
12580
|
-
|
13164
|
+
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
13165
|
+
std::vector<std::string> words(1, "");
|
13166
|
+
|
13167
|
+
for (const char32_t cpt : cpts_nfd) {
|
13168
|
+
const auto flags = unicode_cpt_flags(cpt);
|
13169
|
+
|
13170
|
+
if (flags.is_whitespace) {
|
13171
|
+
if (words.back().size()) { // finish previous word if any
|
13172
|
+
words.emplace_back();
|
13173
|
+
}
|
12581
13174
|
continue;
|
12582
13175
|
}
|
12583
|
-
|
12584
|
-
|
12585
|
-
|
12586
|
-
|
12587
|
-
std::string s = unicode_cpt_to_utf8(code);
|
12588
|
-
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
12589
|
-
new_str += " ";
|
12590
|
-
new_str += s;
|
12591
|
-
new_str += " ";
|
12592
|
-
} else {
|
12593
|
-
new_str += s;
|
13176
|
+
|
13177
|
+
assert (!flags.is_separator);
|
13178
|
+
if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
|
13179
|
+
continue;
|
12594
13180
|
}
|
12595
|
-
}
|
12596
13181
|
|
12597
|
-
|
12598
|
-
|
12599
|
-
|
12600
|
-
|
12601
|
-
|
12602
|
-
|
12603
|
-
|
12604
|
-
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
12605
|
-
l = r + 1;
|
12606
|
-
r = l;
|
13182
|
+
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
13183
|
+
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
|
13184
|
+
if (words.back().size()) { // finish previous word if any
|
13185
|
+
words.emplace_back();
|
13186
|
+
}
|
13187
|
+
words.back() = s; // single char word
|
13188
|
+
words.emplace_back(); // start a new word
|
12607
13189
|
} else {
|
12608
|
-
|
13190
|
+
words.back() += s; // append char to word
|
12609
13191
|
}
|
12610
13192
|
}
|
12611
|
-
if (r > l) {
|
12612
|
-
words.push_back(new_str.substr(l, (r - l)));
|
12613
|
-
}
|
12614
|
-
return words;
|
12615
|
-
}
|
12616
13193
|
|
12617
|
-
|
12618
|
-
|
12619
|
-
return false;
|
13194
|
+
if (!words.back().size()) {
|
13195
|
+
words.pop_back();
|
12620
13196
|
}
|
12621
|
-
|
12622
|
-
return
|
13197
|
+
|
13198
|
+
return words;
|
12623
13199
|
}
|
12624
13200
|
|
12625
|
-
bool is_chinese_char(uint32_t cpt) {
|
12626
|
-
|
12627
|
-
(cpt >=
|
13201
|
+
static bool is_chinese_char(uint32_t cpt) {
|
13202
|
+
return
|
13203
|
+
(cpt >= 0x04E00 && cpt <= 0x09FFF) ||
|
13204
|
+
(cpt >= 0x03400 && cpt <= 0x04DBF) ||
|
12628
13205
|
(cpt >= 0x20000 && cpt <= 0x2A6DF) ||
|
12629
13206
|
(cpt >= 0x2A700 && cpt <= 0x2B73F) ||
|
12630
13207
|
(cpt >= 0x2B740 && cpt <= 0x2B81F) ||
|
12631
13208
|
(cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
|
12632
|
-
(cpt >= 0xF900 && cpt <= 0xFAFF) ||
|
12633
|
-
(cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
|
12634
|
-
(cpt >= 0x3000 && cpt <= 0x303F) ||
|
12635
|
-
(cpt >= 0xFF00 && cpt <= 0xFFEF)) {
|
12636
|
-
return true; // NOLINT
|
12637
|
-
}
|
12638
|
-
return false;
|
13209
|
+
(cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
|
13210
|
+
(cpt >= 0x2F800 && cpt <= 0x2FA1F);
|
13211
|
+
//(cpt >= 0x3000 && cpt <= 0x303F) ||
|
13212
|
+
//(cpt >= 0xFF00 && cpt <= 0xFFEF);
|
12639
13213
|
}
|
12640
13214
|
|
12641
13215
|
const llama_vocab & vocab;
|
@@ -12679,9 +13253,8 @@ struct fragment_buffer_variant {
|
|
12679
13253
|
|
12680
13254
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
12681
13255
|
// for each special token
|
12682
|
-
for (const auto & st: vocab.special_tokens_cache) {
|
12683
|
-
const auto & special_token = st.first;
|
12684
|
-
const auto & special_id = st.second;
|
13256
|
+
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
13257
|
+
const auto & special_token = vocab.id_to_token[special_id].text;
|
12685
13258
|
|
12686
13259
|
// for each text fragment
|
12687
13260
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
@@ -12690,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12690
13263
|
|
12691
13264
|
// if a fragment is text ( not yet processed )
|
12692
13265
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
12693
|
-
auto * raw_text = &(fragment.raw_text);
|
13266
|
+
auto & raw_text = fragment.raw_text;
|
12694
13267
|
|
12695
13268
|
auto raw_text_base_offset = fragment.offset;
|
12696
13269
|
auto raw_text_base_length = fragment.length;
|
@@ -12700,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12700
13273
|
// find the first occurrence of a given special token in this fragment
|
12701
13274
|
// passing offset argument only limit the "search area" but match coordinates
|
12702
13275
|
// are still relative to the source full raw_text
|
12703
|
-
auto match = raw_text->find(special_token, raw_text_base_offset);
|
13276
|
+
auto match = raw_text.find(special_token, raw_text_base_offset);
|
12704
13277
|
|
12705
13278
|
// no occurrences found, stop processing this fragment for a given special token
|
12706
13279
|
if (match == std::string::npos) break;
|
@@ -12719,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12719
13292
|
// left
|
12720
13293
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
12721
13294
|
const int64_t left_reminder_length = match - raw_text_base_offset;
|
12722
|
-
buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
|
13295
|
+
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
12723
13296
|
|
12724
13297
|
#ifdef PRETOKENIZERDEBUG
|
12725
13298
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
@@ -12735,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12735
13308
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
12736
13309
|
const int64_t right_reminder_offset = match + special_token.length();
|
12737
13310
|
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
12738
|
-
buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
|
13311
|
+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
12739
13312
|
|
12740
13313
|
#ifdef PRETOKENIZERDEBUG
|
12741
13314
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
@@ -12788,9 +13361,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12788
13361
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
12789
13362
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
12790
13363
|
|
13364
|
+
static const bool rtrim = true; //TODO: as param
|
13365
|
+
bool is_prev_special = false;
|
13366
|
+
bool special_token_rtrim = false;
|
13367
|
+
|
12791
13368
|
if (add_special && vocab.special_add_bos != 0) {
|
12792
13369
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
12793
13370
|
output.push_back(vocab.special_bos_id);
|
13371
|
+
is_prev_special = true;
|
12794
13372
|
}
|
12795
13373
|
|
12796
13374
|
for (const auto & fragment : fragment_buffer) {
|
@@ -12802,9 +13380,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12802
13380
|
// and passing 'add space prefix' as bool argument
|
12803
13381
|
//
|
12804
13382
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
12805
|
-
|
12806
|
-
|
12807
|
-
|
13383
|
+
|
13384
|
+
if (special_token_rtrim) {
|
13385
|
+
size_t num_whitespaces = 0;
|
13386
|
+
while (isspace(raw_text[num_whitespaces])) {
|
13387
|
+
num_whitespaces++;
|
13388
|
+
}
|
13389
|
+
if (num_whitespaces == raw_text.size()) {
|
13390
|
+
continue; // skip if all whitespaces
|
13391
|
+
}
|
13392
|
+
raw_text = raw_text.substr(num_whitespaces);
|
13393
|
+
}
|
13394
|
+
|
13395
|
+
if (vocab.add_space_prefix) {
|
13396
|
+
if (!output.size() || is_prev_special) { // prefix with space if first token
|
13397
|
+
raw_text = " " + raw_text;
|
12808
13398
|
}
|
12809
13399
|
}
|
12810
13400
|
|
@@ -12816,6 +13406,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12816
13406
|
tokenizer.tokenize(raw_text, output);
|
12817
13407
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
12818
13408
|
output.push_back(fragment.token);
|
13409
|
+
is_prev_special = true;
|
13410
|
+
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
13411
|
+
special_token_rtrim = rtrim
|
13412
|
+
&& fragment.token != vocab.special_bos_id
|
13413
|
+
&& fragment.token != vocab.special_unk_id
|
13414
|
+
&& fragment.token != vocab.special_eos_id;
|
12819
13415
|
}
|
12820
13416
|
}
|
12821
13417
|
|
@@ -13816,7 +14412,7 @@ void llama_sample_repetition_penalties(
|
|
13816
14412
|
|
13817
14413
|
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
13818
14414
|
GGML_ASSERT(ctx);
|
13819
|
-
|
14415
|
+
int64_t t_start_sample_us = ggml_time_us();
|
13820
14416
|
|
13821
14417
|
bool allow_eog = false;
|
13822
14418
|
for (const auto & stack : grammar->stacks) {
|
@@ -13828,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
13828
14424
|
|
13829
14425
|
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
13830
14426
|
candidates_decoded.reserve(candidates->size);
|
13831
|
-
|
14427
|
+
|
14428
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
13832
14429
|
candidates_grammar.reserve(candidates->size);
|
13833
14430
|
|
13834
14431
|
for (size_t i = 0; i < candidates->size; ++i) {
|
13835
|
-
const llama_token id    = candidates->data[i].id;
|
13836
|
-
const std::string piece = llama_token_to_piece(ctx, id, false);
|
14432
|
+
const llama_token id = candidates->data[i].id;
|
14433
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
|
13837
14434
|
|
13838
14435
|
if (llama_token_is_eog(&ctx->model, id)) {
|
13839
14436
|
if (!allow_eog) {
|
@@ -14033,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
14033
14630
|
GGML_ASSERT(false);
|
14034
14631
|
}
|
14035
14632
|
|
14036
|
-
const std::string piece = llama_token_to_piece(ctx, token, false);
|
14633
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
|
14037
14634
|
|
14038
14635
|
// Note terminating 0 in decoded string
|
14039
14636
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
@@ -14518,8 +15115,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
14518
15115
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
14519
15116
|
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
14520
15117
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
14521
|
-
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
14522
|
-
(qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
14523
15118
|
if (qs.model.type == MODEL_70B) {
|
14524
15119
|
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
14525
15120
|
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
@@ -15533,10 +16128,6 @@ void llama_backend_init(void) {
|
|
15533
16128
|
struct ggml_context * ctx = ggml_init(params);
|
15534
16129
|
ggml_free(ctx);
|
15535
16130
|
}
|
15536
|
-
|
15537
|
-
#ifdef GGML_USE_MPI
|
15538
|
-
ggml_mpi_backend_init();
|
15539
|
-
#endif
|
15540
16131
|
}
|
15541
16132
|
|
15542
16133
|
void llama_numa_init(enum ggml_numa_strategy numa) {
|
@@ -15546,9 +16137,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
|
|
15546
16137
|
}
|
15547
16138
|
|
15548
16139
|
void llama_backend_free(void) {
|
15549
|
-
#ifdef GGML_USE_MPI
|
15550
|
-
ggml_mpi_backend_free();
|
15551
|
-
#endif
|
15552
16140
|
ggml_quantize_free();
|
15553
16141
|
}
|
15554
16142
|
|
@@ -15691,6 +16279,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15691
16279
|
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
15692
16280
|
}
|
15693
16281
|
|
16282
|
+
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
|
15694
16283
|
cparams.causal_attn = hparams.causal_attn;
|
15695
16284
|
|
15696
16285
|
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
@@ -15949,20 +16538,6 @@ struct llama_context * llama_new_context_with_model(
|
|
15949
16538
|
}
|
15950
16539
|
}
|
15951
16540
|
|
15952
|
-
#ifdef GGML_USE_MPI
|
15953
|
-
ctx->ctx_mpi = ggml_mpi_init();
|
15954
|
-
|
15955
|
-
if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
|
15956
|
-
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
|
15957
|
-
// TODO: needs fix after #3228
|
15958
|
-
GGML_ASSERT(false && "not implemented");
|
15959
|
-
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
|
15960
|
-
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
|
15961
|
-
llama_backend_free();
|
15962
|
-
exit(1);
|
15963
|
-
}
|
15964
|
-
#endif
|
15965
|
-
|
15966
16541
|
return ctx;
|
15967
16542
|
}
|
15968
16543
|
|
@@ -15999,7 +16574,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15999
16574
|
// these models do not use RoPE
|
16000
16575
|
case LLM_ARCH_GPT2:
|
16001
16576
|
case LLM_ARCH_GPTJ:
|
16002
|
-
case LLM_ARCH_GPTNEOX:
|
16003
16577
|
case LLM_ARCH_MPT:
|
16004
16578
|
case LLM_ARCH_REFACT:
|
16005
16579
|
case LLM_ARCH_BLOOM:
|
@@ -16019,13 +16593,14 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
16019
16593
|
case LLM_ARCH_XVERSE:
|
16020
16594
|
case LLM_ARCH_COMMAND_R:
|
16021
16595
|
case LLM_ARCH_OLMO:
|
16596
|
+
case LLM_ARCH_ARCTIC:
|
16597
|
+
case LLM_ARCH_DEEPSEEK2:
|
16022
16598
|
return LLAMA_ROPE_TYPE_NORM;
|
16023
16599
|
|
16024
16600
|
// the pairs of head values are offset by n_rot/2
|
16025
16601
|
case LLM_ARCH_FALCON:
|
16026
16602
|
case LLM_ARCH_GROK:
|
16027
16603
|
case LLM_ARCH_DBRX:
|
16028
|
-
case LLM_ARCH_PERSIMMON:
|
16029
16604
|
case LLM_ARCH_BERT:
|
16030
16605
|
case LLM_ARCH_NOMIC_BERT:
|
16031
16606
|
case LLM_ARCH_STABLELM:
|
@@ -16036,6 +16611,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
16036
16611
|
case LLM_ARCH_PHI3:
|
16037
16612
|
case LLM_ARCH_GEMMA:
|
16038
16613
|
case LLM_ARCH_STARCODER2:
|
16614
|
+
case LLM_ARCH_GPTNEOX:
|
16039
16615
|
return LLAMA_ROPE_TYPE_NEOX;
|
16040
16616
|
|
16041
16617
|
// all model arches should be listed explicitly here
|
@@ -16195,6 +16771,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
|
16195
16771
|
}
|
16196
16772
|
|
16197
16773
|
// make tensors
|
16774
|
+
cvec.tensors.reserve(model.hparams.n_layer);
|
16198
16775
|
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
16199
16776
|
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
16200
16777
|
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
|
@@ -16203,6 +16780,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
|
|
16203
16780
|
}
|
16204
16781
|
|
16205
16782
|
// allocate tensors / buffers and zero
|
16783
|
+
cvec.ctxs.reserve(ctx_map.size());
|
16784
|
+
cvec.bufs.reserve(ctx_map.size());
|
16206
16785
|
for (auto it : ctx_map) {
|
16207
16786
|
ggml_backend_buffer_type_t buft = it.first;
|
16208
16787
|
ggml_context * ctx = it.second;
|
@@ -17411,6 +17990,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
|
|
17411
17990
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
17412
17991
|
}
|
17413
17992
|
|
17993
|
+
uint32_t llama_n_threads(struct llama_context * ctx) {
|
17994
|
+
return ctx->cparams.n_threads;
|
17995
|
+
}
|
17996
|
+
|
17997
|
+
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
|
17998
|
+
return ctx->cparams.n_threads_batch;
|
17999
|
+
}
|
18000
|
+
|
17414
18001
|
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
17415
18002
|
ctx->abort_callback = abort_callback;
|
17416
18003
|
ctx->abort_callback_data = abort_callback_data;
|
@@ -17634,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
|
17634
18221
|
);
|
17635
18222
|
}
|
17636
18223
|
|
18224
|
+
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
|
18225
|
+
return llama_is_control_token(model->vocab, token);
|
18226
|
+
}
|
18227
|
+
|
17637
18228
|
llama_token llama_token_bos(const struct llama_model * model) {
|
17638
18229
|
return model->vocab.special_bos_id;
|
17639
18230
|
}
|
@@ -17705,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {
|
|
17705
18296
|
|
17706
18297
|
const auto cpts = unicode_cpts_from_utf8(text);
|
17707
18298
|
for (const auto cpt : cpts) {
|
17708
|
-
|
18299
|
+
const auto utf8 = unicode_cpt_to_utf8(cpt);
|
18300
|
+
try {
|
18301
|
+
decoded_text += unicode_utf8_to_byte(utf8);
|
18302
|
+
} catch (const std::out_of_range & e) {
|
18303
|
+
decoded_text += "[UNK_BYTE_0x";
|
18304
|
+
for (const auto c : utf8) {
|
18305
|
+
decoded_text += format("%02x", (uint8_t) c);
|
18306
|
+
}
|
18307
|
+
decoded_text += text + "]";
|
18308
|
+
}
|
17709
18309
|
}
|
17710
18310
|
|
17711
18311
|
return decoded_text;
|
@@ -17713,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {
17713 18313 |
17714 18314 |   // does not write null-terminator to buf
17715 18315 |   int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
      18316 | +     // if we have a cache - use it
      18317 | +     {
      18318 | +         const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
      18319 | +
      18320 | +         if (!cache.empty()) {
      18321 | +             const auto & res = cache.at(token);
      18322 | +             if (length < (int) res.size()) {
      18323 | +                 return -(int) res.size();
      18324 | +             }
      18325 | +             memcpy(buf, res.c_str(), res.size());
      18326 | +             return res.size();
      18327 | +         }
      18328 | +     }
      18329 | +
17716 18330 |       if (0 <= token && token < llama_n_vocab(model)) {
17717 18331 |           switch (llama_vocab_get_type(model->vocab)) {
17718       | -
17719       | -
17720       | -
17721       | -
17722       | -
17723       | -
17724       | -
17725       | -
17726       | -
17727       | -
17728       | -
17729       | -
17730       | -
17731       | -
17732       | -
17733       | -
17734       | -
17735       | -
17736       | -
17737       | -
17738       | -
17739       | -
17740       | -
17741       | -
17742       | -
17743       | -
17744       | -
17745       | -
17746       | -
17747       | -
      18332 | +             case LLAMA_VOCAB_TYPE_WPM:
      18333 | +             case LLAMA_VOCAB_TYPE_SPM: {
      18334 | +                 // NOTE: we accept all unsupported token types,
      18335 | +                 // suppressing them like CONTROL tokens.
      18336 | +                 if (llama_is_normal_token(model->vocab, token)) {
      18337 | +                     std::string result = model->vocab.id_to_token[token].text;
      18338 | +                     llama_unescape_whitespace(result);
      18339 | +                     if (length < (int) result.length()) {
      18340 | +                         return -(int) result.length();
      18341 | +                     }
      18342 | +                     memcpy(buf, result.c_str(), result.length());
      18343 | +                     return result.length();
      18344 | +                 } else if (
      18345 | +                         (llama_is_user_defined_token(model->vocab, token)) ||
      18346 | +                         (llama_is_control_token (model->vocab, token) && special)) {
      18347 | +                     std::string result = model->vocab.id_to_token[token].text;
      18348 | +                     if (length < (int) result.length()) {
      18349 | +                         return -(int) result.length();
      18350 | +                     }
      18351 | +                     memcpy(buf, result.c_str(), result.length());
      18352 | +                     return result.length();
      18353 | +                 } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
      18354 | +                     if (length < 3) {
      18355 | +                         return -3;
      18356 | +                     }
      18357 | +                     memcpy(buf, "\xe2\x96\x85", 3);
      18358 | +                     return 3;
      18359 | +                 } else if (llama_is_byte_token(model->vocab, token)) {
      18360 | +                     if (length < 1) {
      18361 | +                         return -1;
      18362 | +                     }
      18363 | +                     buf[0] = llama_token_to_byte(model->vocab, token);
      18364 | +                     return 1;
17748 18365 |                   }
17749       | -
17750       | - return 1;
      18366 | +                 break;
17751 18367 |               }
17752       | -
17753       | -
17754       | -
17755       | -
17756       | -
17757       | -
17758       | -
17759       | -
17760       | -
17761       | -
17762       | -
17763       | -
17764       | -
17765       | -
17766       | -
17767       | - (
17768       | -
17769       | -
17770       | -
      18368 | +             case LLAMA_VOCAB_TYPE_BPE: {
      18369 | +                 // NOTE: we accept all unsupported token types,
      18370 | +                 // suppressing them like CONTROL tokens.
      18371 | +                 if (llama_is_normal_token(model->vocab, token)) {
      18372 | +                     std::string result = model->vocab.id_to_token[token].text;
      18373 | +                     result = llama_decode_text(result);
      18374 | +                     if (length < (int) result.length()) {
      18375 | +                         return -(int) result.length();
      18376 | +                     }
      18377 | +                     memcpy(buf, result.c_str(), result.length());
      18378 | +                     return result.length();
      18379 | +                 } else if (
      18380 | +                         (llama_is_user_defined_token(model->vocab, token)) ||
      18381 | +                         (llama_is_control_token (model->vocab, token) && special)) {
      18382 | +                     std::string result = model->vocab.id_to_token[token].text;
      18383 | +                     if (length < (int) result.length()) {
      18384 | +                         return -(int) result.length();
      18385 | +                     }
      18386 | +                     memcpy(buf, result.c_str(), result.length());
      18387 | +                     return result.length();
17771 18388 |                   }
17772       | -
17773       | - return result.length();
      18389 | +                 break;
17774 18390 |               }
17775       | -
17776       | -
17777       | - default:
17778       | - GGML_ASSERT(false);
      18391 | +             default:
      18392 | +                 GGML_ASSERT(false);
17779 18393 |           }
17780 18394 |       }
17781 18395 |       return 0;
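Illustrative note: the rewritten llama_token_to_piece keeps the size convention visible above: when the caller's buffer is too small it returns the negated required length. A hedged usage sketch (assumes `model` is a `llama_model *` loaded elsewhere):

    #include "llama.h"
    #include <string>
    #include <vector>

    // Sketch: token -> text using the "negative return means buffer too small" convention.
    static std::string token_to_string(const struct llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8);
        int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        if (n < 0) {
            buf.resize(-n); // -n is the required size
            n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        }
        return n > 0 ? std::string(buf.data(), n) : std::string();
    }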
@@ -17845,6 +18459,15 @@ static int32_t llama_chat_apply_template_internal(
17845 18459 |               }
17846 18460 |           }
17847 18461 |           // llama2 templates seem to not care about "add_generation_prompt"
      18462 | +     } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
      18463 | +         // Phi 3
      18464 | +         for (auto message : chat) {
      18465 | +             std::string role(message->role);
      18466 | +             ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
      18467 | +         }
      18468 | +         if (add_ass) {
      18469 | +             ss << "<|assistant|>\n";
      18470 | +         }
17848 18471 |       } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
17849 18472 |           // zephyr template
17850 18473 |           for (auto message : chat) {
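Illustrative note: the Phi 3 branch formats every message as <|role|>\n<content><|end|>\n and appends <|assistant|>\n when a generation prompt is requested. A self-contained sketch of the resulting prompt layout (plain C++, independent of the library):

    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    // Sketch mirroring the Phi 3 branch above for a two-message chat.
    int main() {
        const std::vector<std::pair<std::string, std::string>> chat = {
            {"system", "You are a helpful assistant."},
            {"user",   "Hello!"},
        };
        std::stringstream ss;
        for (const auto & message : chat) {
            ss << "<|" << message.first << "|>\n" << message.second << "<|end|>\n";
        }
        ss << "<|assistant|>\n"; // add_ass == true
        printf("%s", ss.str().c_str());
        return 0;
    }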
@@ -17977,15 +18600,6 @@ static int32_t llama_chat_apply_template_internal(
17977 18600 |           if (add_ass) {
17978 18601 |               ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
17979 18602 |           }
17980       | -     } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
17981       | -         // Phi 3
17982       | -         for (auto message : chat) {
17983       | -             std::string role(message->role);
17984       | -             ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
17985       | -         }
17986       | -         if (add_ass) {
17987       | -             ss << "<|assistant|>\n";
17988       | -         }
17989 18603 |       } else {
17990 18604 |           // template not supported
17991 18605 |           return -1;
@@ -18107,8 +18721,10 @@ const char * llama_print_system_info(void) {
18107 18721 |       s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
18108 18722 |       s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
18109 18723 |       s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
      18724 | +     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
18110 18725 |       s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
18111 18726 |       s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
      18727 | +     s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
18112 18728 |       s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
18113 18729 |       s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
18114 18730 |       s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
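Illustrative note: the extended feature report can be inspected at runtime; a minimal sketch:

    #include "llama.h"
    #include <cstdio>

    // Sketch: print the CPU feature summary, which now also lists AVX512_BF16 and SVE.
    int main() {
        printf("%s\n", llama_print_system_info());
        return 0;
    }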
@@ -18167,6 +18783,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
18167 18783 |       g_state.log_callback_user_data = user_data;
18168 18784 |   #ifdef GGML_USE_METAL
18169 18785 |       ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
      18786 | + #elif defined(GGML_USE_CUDA)
      18787 | +     ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
18170 18788 |   #endif
18171 18789 |   }
18172 18790 |
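Illustrative note: with this change a callback installed through llama_log_set is also forwarded to the CUDA backend when the library is built with GGML_USE_CUDA. A hedged sketch of installing a callback (the callback signature follows the ggml_log_callback typedef used in the declaration above):

    #include "llama.h"
    #include <cstdio>

    // Sketch: route llama.cpp and backend log messages to stderr.
    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
    }

    int main() {
        llama_log_set(my_log, /*user_data=*/nullptr);
        // ... load a model, create a context, run inference, etc.
        return 0;
    }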