llama_cpp 0.12.6 → 0.12.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
@@ -197,6 +197,7 @@ enum llm_arch {
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
+    LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -207,31 +208,34 @@ enum llm_arch {
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
+    LLM_ARCH_GEMMA,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama"
-    { LLM_ARCH_FALCON, "falcon"
-    { LLM_ARCH_GPT2, "gpt2"
-    { LLM_ARCH_GPTJ, "gptj"
-    { LLM_ARCH_GPTNEOX, "gptneox"
-    { LLM_ARCH_MPT, "mpt"
-    { LLM_ARCH_BAICHUAN, "baichuan"
-    { LLM_ARCH_STARCODER, "starcoder"
-    { LLM_ARCH_PERSIMMON, "persimmon"
-    { LLM_ARCH_REFACT, "refact"
-    { LLM_ARCH_BERT, "bert"
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_GEMMA, "gemma" },
 };
 
 enum llm_kv {
@@ -254,7 +258,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
-
+    LLM_KV_POOLING_TYPE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -312,7 +316,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
-    {
+    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -375,6 +379,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT,
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
+    LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_NORM,
@@ -387,6 +392,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
+    LLM_TENSOR_LAYER_OUT_NORM,
 };
 
 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -503,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -552,12 +557,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
             { LLM_TENSOR_TOKEN_TYPES, "token_types" },
             { LLM_TENSOR_POS_EMBD, "position_embd" },
-            {
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            {
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_NOMIC_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
@@ -741,6 +761,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
         },
     },
+    {
+        LLM_ARCH_GEMMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1015,7 +1051,7 @@ struct llama_mmap {
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
         // prefetch/readahead impairs performance on NUMA systems
-        if (numa)
+        if (numa) { prefetch = 0; }
 #ifdef __linux__
         // advise the kernel to read the file sequentially (increases readahead)
         if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -1485,6 +1521,7 @@ enum e_model {
     MODEL_22M,
     MODEL_33M,
     MODEL_109M,
+    MODEL_137M,
     MODEL_335M,
     MODEL_0_5B,
     MODEL_1B,
@@ -1537,12 +1574,13 @@ struct llama_hparams {
     uint32_t n_yarn_orig_ctx;
     int32_t rope_scaling_type_train;
 
-    float f_clamp_kqv;
-    float f_max_alibi_bias;
+    float f_clamp_kqv = 0.0f;
+    float f_max_alibi_bias = 0.0f;
 
     bool causal_attn = true;
-    bool
+    bool need_kq_pos = false;
 
+    uint32_t pooling_type = LLAMA_POOLING_NONE;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1620,6 +1658,8 @@ struct llama_layer {
     struct ggml_tensor * attn_q_norm_b;
     struct ggml_tensor * attn_k_norm;
     struct ggml_tensor * attn_k_norm_b;
+    struct ggml_tensor * attn_out_norm;
+    struct ggml_tensor * attn_out_norm_b;
 
     // attention
     struct ggml_tensor * wq;
@@ -1638,6 +1678,8 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * layer_out_norm;
+    struct ggml_tensor * layer_out_norm_b;
 
     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -1899,8 +1941,10 @@ struct llama_context {
     struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos; // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+    struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
-    struct ggml_tensor *
+    struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_cls; // I32 [n_batch]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -2499,6 +2543,8 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
                 case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+                case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+                case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2744,13 +2790,7 @@ struct llama_model_loader {
 
         std::vector<no_init<uint8_t>> read_buf;
 
-        for (
-            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-            if (!cur) {
-                // some tensors may be allocated in a different context
-                continue;
-            }
-
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
@@ -2848,6 +2888,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2855,6 +2897,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_0_5B: return "0.5B";
         case MODEL_1B: return "1B";
         case MODEL_2B: return "2B";
         case MODEL_3B: return "3B";
@@ -3024,6 +3071,11 @@ static void llm_load_hparams(
                     case 40: model.type = e_model::MODEL_13B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                if (model.type == e_model::MODEL_13B) {
+                    // TODO: become GGUF KV parameter
+                    hparams.f_max_alibi_bias = 8.0f;
+                }
             } break;
         case LLM_ARCH_STARCODER:
             {
@@ -3051,13 +3103,16 @@ static void llm_load_hparams(
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
             } break;
         case LLM_ARCH_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                 switch (hparams.n_layer) {
                     case 3:
@@ -3073,6 +3128,17 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+                    model.type = e_model::MODEL_137M;
+                }
+            } break;
        case LLM_ARCH_BLOOM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3085,11 +3151,12 @@ static void llm_load_hparams(
                            case 4096: model.type = e_model::MODEL_7B; break;
                        } break;
                }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_MPT:
            {
-                hparams.f_clamp_kqv = 0.0f;
-
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
|
3187
3254
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3188
3255
|
}
|
3189
3256
|
} break;
|
3257
|
+
case LLM_ARCH_GEMMA:
|
3258
|
+
{
|
3259
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3260
|
+
|
3261
|
+
switch (hparams.n_layer) {
|
3262
|
+
case 18: model.type = e_model::MODEL_2B; break;
|
3263
|
+
case 28: model.type = e_model::MODEL_7B; break;
|
3264
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3265
|
+
}
|
3266
|
+
} break;
|
3190
3267
|
default: (void)0;
|
3191
3268
|
}
|
3192
3269
|
|
3193
3270
|
model.ftype = ml.ftype;
|
3271
|
+
|
3272
|
+
if (hparams.f_max_alibi_bias > 0.0f) {
|
3273
|
+
hparams.need_kq_pos = true;
|
3274
|
+
}
|
3194
3275
|
}
|
3195
3276
|
|
3196
3277
|
// TODO: This should probably be in llama.h
|
@@ -3634,7 +3715,7 @@ static bool llm_load_tensors(
|
|
3634
3715
|
}
|
3635
3716
|
|
3636
3717
|
// create one context per buffer type
|
3637
|
-
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
3718
|
+
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
3638
3719
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
3639
3720
|
for (auto & it : buft_layer_count) {
|
3640
3721
|
struct ggml_init_params params = {
|
@@ -3772,6 +3853,7 @@ static bool llm_load_tensors(
|
|
3772
3853
|
} else {
|
3773
3854
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
3774
3855
|
ml.n_created--; // artificial tensor
|
3856
|
+
ml.size_data += ggml_nbytes(model.output);
|
3775
3857
|
}
|
3776
3858
|
}
|
3777
3859
|
|
@@ -3875,10 +3957,14 @@ static bool llm_load_tensors(
|
|
3875
3957
|
}
|
3876
3958
|
} break;
|
3877
3959
|
case LLM_ARCH_BERT:
|
3960
|
+
case LLM_ARCH_NOMIC_BERT:
|
3878
3961
|
{
|
3879
|
-
model.tok_embd
|
3880
|
-
model.type_embd
|
3881
|
-
model.
|
3962
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3963
|
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
3964
|
+
if (model.arch == LLM_ARCH_BERT) {
|
3965
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
3966
|
+
}
|
3967
|
+
|
3882
3968
|
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
3883
3969
|
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
3884
3970
|
|
@@ -3888,29 +3974,38 @@ static bool llm_load_tensors(
|
|
3888
3974
|
|
3889
3975
|
auto & layer = model.layers[i];
|
3890
3976
|
|
3891
|
-
|
3892
|
-
|
3977
|
+
if (model.arch == LLM_ARCH_BERT) {
|
3978
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
3979
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
3893
3980
|
|
3894
|
-
|
3895
|
-
|
3981
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
3982
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
3896
3983
|
|
3897
|
-
|
3898
|
-
|
3984
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
3985
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
3986
|
+
} else {
|
3987
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
3988
|
+
}
|
3899
3989
|
|
3900
|
-
layer.
|
3901
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
3990
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
3902
3991
|
|
3903
|
-
layer.
|
3904
|
-
layer.
|
3992
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
3993
|
+
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
3905
3994
|
|
3906
|
-
layer.
|
3907
|
-
layer.
|
3995
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
3996
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
3908
3997
|
|
3909
|
-
|
3910
|
-
|
3998
|
+
if (model.arch == LLM_ARCH_BERT) {
|
3999
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
4000
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
3911
4001
|
|
3912
|
-
|
3913
|
-
|
4002
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
4003
|
+
} else {
|
4004
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4005
|
+
}
|
4006
|
+
|
4007
|
+
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
4008
|
+
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
3914
4009
|
}
|
3915
4010
|
} break;
|
3916
4011
|
case LLM_ARCH_BLOOM:
|
@@ -3958,7 +4053,12 @@ static bool llm_load_tensors(
|
|
3958
4053
|
// output
|
3959
4054
|
{
|
3960
4055
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
3961
|
-
model.
|
4056
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
4057
|
+
|
4058
|
+
// same as tok_embd, duplicated to allow offloading
|
4059
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4060
|
+
ml.n_created--; // artificial tensor
|
4061
|
+
ml.size_data += ggml_nbytes(model.output);
|
3962
4062
|
}
|
3963
4063
|
|
3964
4064
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -3967,14 +4067,23 @@ static bool llm_load_tensors(
|
|
3967
4067
|
|
3968
4068
|
auto & layer = model.layers[i];
|
3969
4069
|
|
3970
|
-
layer.attn_norm
|
4070
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4071
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
|
3971
4072
|
|
3972
4073
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4074
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
|
4075
|
+
|
3973
4076
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4077
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
|
3974
4078
|
|
3975
|
-
layer.ffn_norm
|
3976
|
-
layer.
|
3977
|
-
|
4079
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4080
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
4081
|
+
|
4082
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
4083
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
|
4084
|
+
|
4085
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4086
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
3978
4087
|
|
3979
4088
|
// AWQ ScaleActivation layer
|
3980
4089
|
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
@@ -4287,6 +4396,40 @@ static bool llm_load_tensors(
|
|
4287
4396
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4288
4397
|
}
|
4289
4398
|
} break;
|
4399
|
+
case LLM_ARCH_GEMMA:
|
4400
|
+
{
|
4401
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4402
|
+
|
4403
|
+
// output
|
4404
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4405
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
4406
|
+
ml.n_created--; // artificial tensor
|
4407
|
+
ml.size_data += ggml_nbytes(model.output);
|
4408
|
+
|
4409
|
+
const int64_t n_ff = hparams.n_ff;
|
4410
|
+
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
4411
|
+
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
4412
|
+
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
4413
|
+
|
4414
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
4415
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4416
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4417
|
+
|
4418
|
+
auto & layer = model.layers[i];
|
4419
|
+
|
4420
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4421
|
+
|
4422
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
|
4423
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
4424
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
4425
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
|
4426
|
+
|
4427
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4428
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4429
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4430
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4431
|
+
}
|
4432
|
+
} break;
|
4290
4433
|
default:
|
4291
4434
|
throw std::runtime_error("unknown architecture");
|
4292
4435
|
}
|
@@ -4720,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
|
|
4720
4863
|
struct ggml_tensor * wo_b,
|
4721
4864
|
struct ggml_tensor * q_cur,
|
4722
4865
|
struct ggml_tensor * kq_mask,
|
4866
|
+
struct ggml_tensor * kq_pos,
|
4723
4867
|
int64_t n_ctx,
|
4724
4868
|
int32_t n_tokens,
|
4725
4869
|
int32_t n_kv,
|
4726
|
-
float max_alibi_bias,
|
4727
4870
|
float kq_scale,
|
4728
4871
|
const llm_build_cb & cb,
|
4729
4872
|
int il) {
|
@@ -4753,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
|
|
4753
4896
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
4754
4897
|
}
|
4755
4898
|
|
4756
|
-
|
4757
|
-
|
4899
|
+
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
|
4900
|
+
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
|
4901
|
+
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
4902
|
+
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
4903
|
+
if (hparams.f_max_alibi_bias > 0.0f) {
|
4758
4904
|
kq = ggml_scale(ctx, kq, kq_scale);
|
4759
4905
|
cb(kq, "kq_scaled", il);
|
4760
4906
|
|
4761
|
-
|
4762
|
-
|
4763
|
-
// TODO: K-shift is likely not working
|
4764
|
-
// TODO: change to ggml_add
|
4765
|
-
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
|
4766
|
-
cb(kq, "kq_scaled_alibi", il);
|
4767
|
-
}
|
4907
|
+
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
|
4908
|
+
cb(kq, "kq_scaled_alibi", il);
|
4768
4909
|
|
4769
4910
|
kq = ggml_add(ctx, kq, kq_mask);
|
4770
4911
|
cb(kq, "kq_masked", il);
|
4771
4912
|
|
4772
4913
|
kq = ggml_soft_max(ctx, kq);
|
4773
4914
|
cb(kq, "kq_soft_max", il);
|
4774
|
-
} else
|
4775
|
-
|
4915
|
+
} else
|
4916
|
+
#endif
|
4917
|
+
{
|
4918
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
|
4776
4919
|
cb(kq, "kq_soft_max_ext", il);
|
4777
4920
|
}
|
4778
4921
|
|
@@ -4820,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
|
|
4820
4963
|
struct ggml_tensor * v_cur,
|
4821
4964
|
struct ggml_tensor * q_cur,
|
4822
4965
|
struct ggml_tensor * kq_mask,
|
4966
|
+
struct ggml_tensor * kq_pos,
|
4823
4967
|
int64_t n_ctx,
|
4824
4968
|
int32_t n_tokens,
|
4825
4969
|
int32_t kv_head,
|
4826
4970
|
int32_t n_kv,
|
4827
|
-
float max_alibi_bias,
|
4828
4971
|
float kq_scale,
|
4829
4972
|
const llm_build_cb & cb,
|
4830
4973
|
int il) {
|
@@ -4838,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
|
|
4838
4981
|
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
|
4839
4982
|
|
4840
4983
|
struct ggml_tensor * cur;
|
4841
|
-
cur = llm_build_kqv(ctx, model, hparams, kv, graph,
|
4842
|
-
|
4843
|
-
q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
|
4984
|
+
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
4985
|
+
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
|
4844
4986
|
cb(cur, "kqv_out", il);
|
4845
4987
|
|
4846
4988
|
return cur;
|
@@ -4881,7 +5023,7 @@ struct llm_build_context {
|
|
4881
5023
|
const int32_t n_orig_ctx;
|
4882
5024
|
|
4883
5025
|
const bool do_rope_shift;
|
4884
|
-
const
|
5026
|
+
const uint32_t pooling_type;
|
4885
5027
|
|
4886
5028
|
const llm_build_cb & cb;
|
4887
5029
|
|
@@ -4925,7 +5067,7 @@ struct llm_build_context {
|
|
4925
5067
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
4926
5068
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
4927
5069
|
do_rope_shift (worst_case || kv_self.has_shift),
|
4928
|
-
do_pooling
|
5070
|
+
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
|
4929
5071
|
cb (cb),
|
4930
5072
|
buf_compute_meta (lctx.buf_compute_meta) {
|
4931
5073
|
// all initializations should be done in init()
|
@@ -5008,7 +5150,7 @@ struct llm_build_context {
|
|
5008
5150
|
}
|
5009
5151
|
|
5010
5152
|
Qcur = ggml_rope_custom(
|
5011
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
|
5153
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5012
5154
|
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
5013
5155
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5014
5156
|
);
|
@@ -5023,7 +5165,7 @@ struct llm_build_context {
|
|
5023
5165
|
|
5024
5166
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5025
5167
|
model.layers[il].wo, model.layers[il].bo,
|
5026
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
5168
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5027
5169
|
cb(cur, "kqv_out", il);
|
5028
5170
|
}
|
5029
5171
|
|
@@ -5153,6 +5295,10 @@ struct llm_build_context {
|
|
5153
5295
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5154
5296
|
cb(KQ_mask, "KQ_mask", -1);
|
5155
5297
|
|
5298
|
+
// positions of the tokens in the KV cache
|
5299
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
5300
|
+
cb(KQ_pos, "KQ_pos", -1);
|
5301
|
+
|
5156
5302
|
// shift the entire K-cache if needed
|
5157
5303
|
if (do_rope_shift) {
|
5158
5304
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
@@ -5201,12 +5347,9 @@ struct llm_build_context {
|
|
5201
5347
|
cb(Kcur, "Kcur", il);
|
5202
5348
|
|
5203
5349
|
|
5204
|
-
// apply ALiBi for 13B model
|
5205
|
-
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
5206
|
-
|
5207
5350
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5208
5351
|
model.layers[il].wo, NULL,
|
5209
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
5352
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5210
5353
|
cb(cur, "kqv_out", il);
|
5211
5354
|
}
|
5212
5355
|
|
@@ -5330,7 +5473,7 @@ struct llm_build_context {
|
|
5330
5473
|
|
5331
5474
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5332
5475
|
model.layers[il].wo, NULL,
|
5333
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
5476
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5334
5477
|
cb(cur, "kqv_out", il);
|
5335
5478
|
}
|
5336
5479
|
|
@@ -5429,7 +5572,7 @@ struct llm_build_context {
|
|
5429
5572
|
|
5430
5573
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5431
5574
|
model.layers[il].wo, model.layers[il].bo,
|
5432
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
5575
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5433
5576
|
cb(cur, "kqv_out", il);
|
5434
5577
|
}
|
5435
5578
|
|
@@ -5634,7 +5777,7 @@ struct llm_build_context {
|
|
5634
5777
|
|
5635
5778
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5636
5779
|
model.layers[il].wo, model.layers[il].bo,
|
5637
|
-
Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
5780
|
+
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5638
5781
|
cb(cur, "kqv_out", il);
|
5639
5782
|
}
|
5640
5783
|
|
@@ -5696,6 +5839,10 @@ struct llm_build_context {
|
|
5696
5839
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5697
5840
|
cb(KQ_mask, "KQ_mask", -1);
|
5698
5841
|
|
5842
|
+
// positions of the tokens in the KV cache
|
5843
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
5844
|
+
cb(KQ_pos, "KQ_pos", -1);
|
5845
|
+
|
5699
5846
|
for (int il = 0; il < n_layer; ++il) {
|
5700
5847
|
struct ggml_tensor * inpSA = inpL;
|
5701
5848
|
|
@@ -5723,7 +5870,7 @@ struct llm_build_context {
|
|
5723
5870
|
|
5724
5871
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5725
5872
|
model.layers[il].wo, NULL,
|
5726
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
5873
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5727
5874
|
cb(cur, "kqv_out", il);
|
5728
5875
|
}
|
5729
5876
|
|
@@ -5773,6 +5920,7 @@ struct llm_build_context {
|
|
5773
5920
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5774
5921
|
|
5775
5922
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5923
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
5776
5924
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5777
5925
|
|
5778
5926
|
struct ggml_tensor * cur;
|
@@ -5781,7 +5929,8 @@ struct llm_build_context {
|
|
5781
5929
|
// get input vectors with right size
|
5782
5930
|
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
5783
5931
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
5784
|
-
struct ggml_tensor *
|
5932
|
+
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
|
5933
|
+
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
|
5785
5934
|
|
5786
5935
|
// construct input embeddings (token, type, position)
|
5787
5936
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
@@ -5789,7 +5938,9 @@ struct llm_build_context {
|
|
5789
5938
|
// token types are hardcoded to zero ("Sentence A")
|
5790
5939
|
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
5791
5940
|
inpL = ggml_add(ctx0, inpL, type_row0);
|
5792
|
-
|
5941
|
+
if (model.arch == LLM_ARCH_BERT) {
|
5942
|
+
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
5943
|
+
}
|
5793
5944
|
cb(inpL, "inp_embd", -1);
|
5794
5945
|
|
5795
5946
|
// embed layer norm
|
@@ -5805,7 +5956,7 @@ struct llm_build_context {
|
|
5805
5956
|
struct ggml_tensor * cur = inpL;
|
5806
5957
|
|
5807
5958
|
// self-attention
|
5808
|
-
{
|
5959
|
+
if (model.arch == LLM_ARCH_BERT) {
|
5809
5960
|
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
5810
5961
|
cb(Qcur, "Qcur", il);
|
5811
5962
|
|
@@ -5820,7 +5971,38 @@ struct llm_build_context {
|
|
5820
5971
|
|
5821
5972
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5822
5973
|
model.layers[il].wo, model.layers[il].bo,
|
5823
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
5974
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5975
|
+
cb(cur, "kqv_out", il);
|
5976
|
+
} else {
|
5977
|
+
// compute Q and K and RoPE them
|
5978
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5979
|
+
cb(cur, "wqkv", il);
|
5980
|
+
|
5981
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
5982
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
5983
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
5984
|
+
|
5985
|
+
cb(Qcur, "Qcur", il);
|
5986
|
+
cb(Kcur, "Kcur", il);
|
5987
|
+
cb(Vcur, "Vcur", il);
|
5988
|
+
|
5989
|
+
Qcur = ggml_rope_custom(
|
5990
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5991
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
5992
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
5993
|
+
);
|
5994
|
+
cb(Qcur, "Qcur", il);
|
5995
|
+
|
5996
|
+
Kcur = ggml_rope_custom(
|
5997
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5998
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
5999
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6000
|
+
);
|
6001
|
+
cb(Kcur, "Kcur", il);
|
6002
|
+
|
6003
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6004
|
+
model.layers[il].wo, model.layers[il].bo,
|
6005
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5824
6006
|
cb(cur, "kqv_out", il);
|
5825
6007
|
}
|
5826
6008
|
|
@@ -5828,25 +6010,34 @@ struct llm_build_context {
|
|
5828
6010
|
cur = ggml_add(ctx0, cur, inpL);
|
5829
6011
|
|
5830
6012
|
// attention layer norm
|
5831
|
-
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
|
6013
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
|
5832
6014
|
|
5833
6015
|
struct ggml_tensor * ffn_inp = cur;
|
5834
6016
|
cb(ffn_inp, "ffn_inp", il);
|
5835
6017
|
|
5836
6018
|
// feed-forward network
|
5837
|
-
|
5838
|
-
|
5839
|
-
|
5840
|
-
|
5841
|
-
|
5842
|
-
|
6019
|
+
if (model.arch == LLM_ARCH_BERT) {
|
6020
|
+
cur = llm_build_ffn(ctx0, cur,
|
6021
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
6022
|
+
NULL, NULL,
|
6023
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
6024
|
+
NULL,
|
6025
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
6026
|
+
} else {
|
6027
|
+
cur = llm_build_ffn(ctx0, cur,
|
6028
|
+
model.layers[il].ffn_up, NULL,
|
6029
|
+
model.layers[il].ffn_gate, NULL,
|
6030
|
+
model.layers[il].ffn_down, NULL,
|
6031
|
+
NULL,
|
6032
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6033
|
+
}
|
5843
6034
|
cb(cur, "ffn_out", il);
|
5844
6035
|
|
5845
6036
|
// attentions bypass the intermediate layer
|
5846
6037
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
5847
6038
|
|
5848
6039
|
// output layer norm
|
5849
|
-
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
|
6040
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
|
5850
6041
|
|
5851
6042
|
// input for next layer
|
5852
6043
|
inpL = cur;
|
@@ -5856,8 +6047,12 @@ struct llm_build_context {
|
|
5856
6047
|
cur = inpL;
|
5857
6048
|
|
5858
6049
|
// pooling layer
|
5859
|
-
if (
|
5860
|
-
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)),
|
6050
|
+
if (pooling_type == LLAMA_POOLING_MEAN) {
|
6051
|
+
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
6052
|
+
} else if (pooling_type == LLAMA_POOLING_CLS) {
|
6053
|
+
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
6054
|
+
} else {
|
6055
|
+
GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
|
5861
6056
|
}
|
5862
6057
|
cb(cur, "result_embd", -1);
|
5863
6058
|
|
@@ -5883,6 +6078,10 @@ struct llm_build_context {
|
|
5883
6078
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5884
6079
|
cb(KQ_mask, "KQ_mask", -1);
|
5885
6080
|
|
6081
|
+
// positions of the tokens in the KV cache
|
6082
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
6083
|
+
cb(KQ_pos, "KQ_pos", -1);
|
6084
|
+
|
5886
6085
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
5887
6086
|
model.tok_norm,
|
5888
6087
|
model.tok_norm_b,
|
@@ -5916,7 +6115,7 @@ struct llm_build_context {
|
|
5916
6115
|
|
5917
6116
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5918
6117
|
model.layers[il].wo, model.layers[il].bo,
|
5919
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6118
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5920
6119
|
cb(cur, "kqv_out", il);
|
5921
6120
|
}
|
5922
6121
|
|
@@ -5976,12 +6175,16 @@ struct llm_build_context {
|
|
5976
6175
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5977
6176
|
cb(KQ_mask, "KQ_mask", -1);
|
5978
6177
|
|
6178
|
+
// positions of the tokens in the KV cache
|
6179
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
6180
|
+
cb(KQ_pos, "KQ_pos", -1);
|
6181
|
+
|
5979
6182
|
for (int il = 0; il < n_layer; ++il) {
|
5980
6183
|
struct ggml_tensor * attn_norm;
|
5981
6184
|
|
5982
6185
|
attn_norm = llm_build_norm(ctx0, inpL, hparams,
|
5983
6186
|
model.layers[il].attn_norm,
|
5984
|
-
|
6187
|
+
model.layers[il].attn_norm_b,
|
5985
6188
|
LLM_NORM, cb, il);
|
5986
6189
|
cb(attn_norm, "attn_norm", il);
|
5987
6190
|
|
@@ -5992,6 +6195,11 @@ struct llm_build_context {
|
|
5992
6195
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5993
6196
|
cb(cur, "wqkv", il);
|
5994
6197
|
|
6198
|
+
if (model.layers[il].bqkv){
|
6199
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
6200
|
+
cb(cur, "bqkv", il);
|
6201
|
+
}
|
6202
|
+
|
5995
6203
|
if (hparams.f_clamp_kqv > 0.0f) {
|
5996
6204
|
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
5997
6205
|
cb(cur, "wqkv_clamped", il);
|
@@ -6008,8 +6216,8 @@ struct llm_build_context {
|
|
6008
6216
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
6009
6217
|
|
6010
6218
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6011
|
-
model.layers[il].wo,
|
6012
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6219
|
+
model.layers[il].wo, model.layers[il].bo,
|
6220
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6013
6221
|
cb(cur, "kqv_out", il);
|
6014
6222
|
}
|
6015
6223
|
|
@@ -6021,13 +6229,13 @@ struct llm_build_context {
|
|
6021
6229
|
{
|
6022
6230
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6023
6231
|
model.layers[il].ffn_norm,
|
6024
|
-
|
6232
|
+
model.layers[il].ffn_norm_b,
|
6025
6233
|
LLM_NORM, cb, il);
|
6026
6234
|
cb(cur, "ffn_norm", il);
|
6027
6235
|
cur = llm_build_ffn(ctx0, cur,
|
6028
|
-
model.layers[il].ffn_up,
|
6236
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
6029
6237
|
NULL, NULL,
|
6030
|
-
model.layers[il].ffn_down,
|
6238
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
6031
6239
|
model.layers[il].ffn_act,
|
6032
6240
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
6033
6241
|
cb(cur, "ffn_out", il);
|
@@ -6044,7 +6252,7 @@ struct llm_build_context {
|
|
6044
6252
|
|
6045
6253
|
cur = llm_build_norm(ctx0, cur, hparams,
|
6046
6254
|
model.output_norm,
|
6047
|
-
|
6255
|
+
model.output_norm_b,
|
6048
6256
|
LLM_NORM, cb, -1);
|
6049
6257
|
cb(cur, "result_norm", -1);
|
6050
6258
|
|
@@ -6131,7 +6339,7 @@ struct llm_build_context {
|
|
6131
6339
|
|
6132
6340
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6133
6341
|
model.layers[il].wo, NULL,
|
6134
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6342
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6135
6343
|
cb(cur, "kqv_out", il);
|
6136
6344
|
}
|
6137
6345
|
|
@@ -6246,7 +6454,7 @@ struct llm_build_context {
|
|
6246
6454
|
|
6247
6455
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6248
6456
|
model.layers[il].wo, NULL,
|
6249
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6457
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6250
6458
|
cb(cur, "kqv_out", il);
|
6251
6459
|
}
|
6252
6460
|
|
@@ -6367,7 +6575,7 @@ struct llm_build_context {
|
|
6367
6575
|
|
6368
6576
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6369
6577
|
model.layers[il].wo, model.layers[il].bo,
|
6370
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6578
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6371
6579
|
cb(cur, "kqv_out", il);
|
6372
6580
|
}
|
6373
6581
|
|
@@ -6494,7 +6702,7 @@ struct llm_build_context {
|
|
6494
6702
|
|
6495
6703
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6496
6704
|
model.layers[il].wo, model.layers[il].bo,
|
6497
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6705
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
6498
6706
|
cb(cur, "kqv_out", il);
|
6499
6707
|
}
|
6500
6708
|
|
@@ -6597,7 +6805,7 @@ struct llm_build_context {
|
|
6597
6805
|
|
6598
6806
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6599
6807
|
model.layers[il].wo, NULL,
|
6600
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6808
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6601
6809
|
cb(cur, "kqv_out", il);
|
6602
6810
|
}
|
6603
6811
|
struct ggml_tensor * sa_out = cur;
|
@@ -6696,7 +6904,7 @@ struct llm_build_context {
|
|
6696
6904
|
|
6697
6905
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6698
6906
|
model.layers[il].wo, model.layers[il].bo,
|
6699
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
6907
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6700
6908
|
cb(cur, "kqv_out", il);
|
6701
6909
|
}
|
6702
6910
|
|
@@ -6805,7 +7013,7 @@ struct llm_build_context {
|
|
6805
7013
|
|
6806
7014
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6807
7015
|
model.layers[il].wo, model.layers[il].bo,
|
6808
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
7016
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6809
7017
|
cb(cur, "kqv_out", il);
|
6810
7018
|
}
|
6811
7019
|
|
@@ -6923,7 +7131,7 @@ struct llm_build_context {
|
|
6923
7131
|
|
6924
7132
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6925
7133
|
model.layers[il].wo, NULL,
|
6926
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
7134
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6927
7135
|
cb(cur, "kqv_out", il);
|
6928
7136
|
}
|
6929
7137
|
|
@@ -7042,7 +7250,7 @@ struct llm_build_context {
|
|
7042
7250
|
|
7043
7251
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7044
7252
|
model.layers[il].wo, model.layers[il].bo,
|
7045
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
7253
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7046
7254
|
cb(cur, "kqv_out", il);
|
7047
7255
|
}
|
7048
7256
|
|
@@ -7174,7 +7382,7 @@ struct llm_build_context {
|
|
7174
7382
|
|
7175
7383
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7176
7384
|
model.layers[il].wo, model.layers[il].bo,
|
7177
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
7385
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7178
7386
|
cb(cur, "kqv_out", il);
|
7179
7387
|
}
|
7180
7388
|
|
@@ -7233,6 +7441,116 @@ struct llm_build_context {
|
|
7233
7441
|
|
7234
7442
|
return gf;
|
7235
7443
|
}
|
7444
|
+
|
7445
|
+
struct ggml_cgraph * build_gemma() {
|
7446
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7447
|
+
|
7448
|
+
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
7449
|
+
|
7450
|
+
struct ggml_tensor * cur;
|
7451
|
+
struct ggml_tensor * inpL;
|
7452
|
+
|
7453
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
7454
|
+
cb(inpL, "inp_embd", -1);
|
7455
|
+
|
7456
|
+
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
7457
|
+
cb(inpL, "inp_scaled", -1);
|
7458
|
+
|
7459
|
+
// inp_pos - contains the positions
|
7460
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
7461
|
+
cb(inp_pos, "inp_pos", -1);
|
7462
|
+
|
7463
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7464
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7465
|
+
cb(KQ_mask, "KQ_mask", -1);
|
7466
|
+
|
7467
|
+
// shift the entire K-cache if needed
|
7468
|
+
if (do_rope_shift) {
|
7469
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7470
|
+
}
|
7471
|
+
|
7472
|
+
for (int il = 0; il < n_layer; ++il) {
|
7473
|
+
|
7474
|
+
// norm
|
7475
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7476
|
+
model.layers[il].attn_norm, NULL,
|
7477
|
+
LLM_NORM_RMS, cb, il);
|
7478
|
+
cb(cur, "attn_norm", il);
|
7479
|
+
|
7480
|
+
// self-attention
|
7481
|
+
{
|
7482
|
+
// compute Q and K and RoPE them
|
7483
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
7484
|
+
cb(Qcur, "Qcur", il);
|
7485
|
+
|
7486
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
7487
|
+
cb(Kcur, "Kcur", il);
|
7488
|
+
|
7489
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
7490
|
+
cb(Vcur, "Vcur", il);
|
7491
|
+
|
7492
|
+
Qcur = ggml_rope_custom(
|
7493
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
7494
|
+
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
7495
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
7496
|
+
cb(Qcur, "Qcur", il);
|
7497
|
+
|
7498
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
7499
|
+
cb(Qcur, "Qcur_scaled", il);
|
7500
|
+
|
7501
|
+
Kcur = ggml_rope_custom(
|
7502
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
7503
|
+
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
7504
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
7505
|
+
cb(Kcur, "Kcur", il);
|
7506
|
+
|
7507
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7508
|
+
model.layers[il].wo, NULL,
|
7509
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7510
|
+
cb(cur, "kqv_out", il);
|
7511
|
+
}
|
7512
|
+
|
7513
|
+
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
7514
|
+
cb(sa_out, "sa_out", il);
|
7515
|
+
|
7516
|
+
cur = llm_build_norm(ctx0, sa_out, hparams,
|
7517
|
+
model.layers[il].ffn_norm, NULL,
|
7518
|
+
LLM_NORM_RMS, cb, il);
|
7519
|
+
cb(cur, "ffn_norm", il);
|
7520
|
+
|
7521
|
+
// feed-forward network
|
7522
|
+
{
|
7523
|
+
cur = llm_build_ffn(ctx0, cur,
|
7524
|
+
model.layers[il].ffn_up, NULL,
|
7525
|
+
model.layers[il].ffn_gate, NULL,
|
7526
|
+
model.layers[il].ffn_down, NULL,
|
7527
|
+
NULL,
|
7528
|
+
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
7529
|
+
cb(cur, "ffn_out", il);
|
7530
|
+
}
|
7531
|
+
|
7532
|
+
cur = ggml_add(ctx0, cur, sa_out);
|
7533
|
+
cb(cur, "l_out", il);
|
7534
|
+
|
7535
|
+
// input for next layer
|
7536
|
+
inpL = cur;
|
7537
|
+
}
|
7538
|
+
|
7539
|
+
cur = inpL;
|
7540
|
+
|
7541
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7542
|
+
model.output_norm, NULL,
|
7543
|
+
LLM_NORM_RMS, cb, -1);
|
7544
|
+
cb(cur, "result_norm", -1);
|
7545
|
+
|
7546
|
+
// lm_head
|
7547
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7548
|
+
cb(cur, "result_output", -1);
|
7549
|
+
|
7550
|
+
ggml_build_forward_expand(gf, cur);
|
7551
|
+
|
7552
|
+
return gf;
|
7553
|
+
}
|
7236
7554
|
};
|
7237
7555
|
|
7238
7556
|
static struct ggml_cgraph * llama_build_graph(
|
@@ -7289,6 +7607,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7289
7607
|
result = llm.build_refact();
|
7290
7608
|
} break;
|
7291
7609
|
case LLM_ARCH_BERT:
|
7610
|
+
case LLM_ARCH_NOMIC_BERT:
|
7292
7611
|
{
|
7293
7612
|
result = llm.build_bert();
|
7294
7613
|
} break;
|
@@ -7340,6 +7659,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7340
7659
|
{
|
7341
7660
|
result = llm.build_minicpm();
|
7342
7661
|
} break;
|
7662
|
+
case LLM_ARCH_GEMMA:
|
7663
|
+
{
|
7664
|
+
result = llm.build_gemma();
|
7665
|
+
} break;
|
7343
7666
|
default:
|
7344
7667
|
GGML_ASSERT(false);
|
7345
7668
|
}
|
@@ -7404,12 +7727,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7404
7727
|
}
|
7405
7728
|
}
|
7406
7729
|
|
7407
|
-
{
|
7408
|
-
|
7409
|
-
|
7730
|
+
if (hparams.need_kq_pos) {
|
7731
|
+
const int64_t n_kv = kv_self.n;
|
7732
|
+
|
7733
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
7410
7734
|
|
7411
|
-
|
7412
|
-
|
7735
|
+
float * data = (float *) lctx.inp_KQ_pos->data;
|
7736
|
+
|
7737
|
+
for (int i = 0; i < n_kv; ++i) {
|
7738
|
+
data[i] = float(lctx.kv_self.cells[i].pos);
|
7413
7739
|
}
|
7414
7740
|
}
|
7415
7741
|
|
@@ -7425,17 +7751,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7425
7751
|
}
|
7426
7752
|
}
|
7427
7753
|
|
7428
|
-
if (
|
7754
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
7429
7755
|
const int64_t n_tokens = batch.n_tokens;
|
7430
7756
|
|
7431
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.
|
7432
|
-
float * data = (float *) lctx.
|
7757
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
7758
|
+
float * data = (float *) lctx.inp_mean->data;
|
7433
7759
|
|
7434
|
-
memset(lctx.
|
7760
|
+
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
7435
7761
|
|
7762
|
+
std::vector<uint64_t> sum(n_tokens, 0);
|
7436
7763
|
for (int i = 0; i < n_tokens; ++i) {
|
7437
7764
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7438
|
-
|
7765
|
+
sum[seq_id] += 1;
|
7766
|
+
}
|
7767
|
+
|
7768
|
+
std::vector<float> div(n_tokens, 0.0f);
|
7769
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7770
|
+
const uint64_t s = sum[i];
|
7771
|
+
if (s > 0) {
|
7772
|
+
div[i] = 1.0f/float(s);
|
7773
|
+
}
|
7774
|
+
}
|
7775
|
+
|
7776
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7777
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7778
|
+
data[seq_id*n_tokens + i] = div[seq_id];
|
7779
|
+
}
|
7780
|
+
}
|
7781
|
+
|
7782
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
|
7783
|
+
const int64_t n_tokens = batch.n_tokens;
|
7784
|
+
|
7785
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
7786
|
+
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
7787
|
+
|
7788
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7789
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7790
|
+
const llama_pos pos = batch.pos[i];
|
7791
|
+
if (pos == 0) {
|
7792
|
+
data[seq_id] = i;
|
7793
|
+
}
|
7439
7794
|
}
|
7440
7795
|
}
|
7441
7796
|
}
|
@@ -10145,25 +10500,28 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10145
10500
|
return std::make_pair(i_layer, n_layer);
|
10146
10501
|
};
|
10147
10502
|
|
10148
|
-
|
10503
|
+
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
10504
|
+
// with the quantization of the output tensor
|
10505
|
+
if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
|
10506
|
+
(LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
|
10149
10507
|
int nx = tensor->ne[0];
|
10150
10508
|
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
10151
10509
|
new_type = GGML_TYPE_Q8_0;
|
10152
10510
|
}
|
10153
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
10511
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
10154
10512
|
new_type = GGML_TYPE_Q5_K;
|
10155
10513
|
}
|
10156
10514
|
else if (new_type != GGML_TYPE_Q8_0) {
|
10157
10515
|
new_type = GGML_TYPE_Q6_K;
|
10158
10516
|
}
|
10159
10517
|
} else if (name == "token_embd.weight") {
|
10160
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
10518
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
10161
10519
|
new_type = GGML_TYPE_Q2_K;
|
10162
10520
|
}
|
10163
10521
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10164
10522
|
new_type = GGML_TYPE_Q4_K;
|
10165
10523
|
}
|
10166
|
-
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
10524
|
+
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
10167
10525
|
if (name.find("attn_v.weight") != std::string::npos) {
|
10168
10526
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
10169
10527
|
else new_type = GGML_TYPE_Q2_K;
|
@@ -10173,6 +10531,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
            ++qs.i_ffn_down;
        }
+       else if (name.find("attn_output.weight") != std::string::npos) {
+           if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+       }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -10187,6 +10548,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+           new_type = GGML_TYPE_Q5_K;
+       }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -10239,6 +10603,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
+           if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
@@ -10255,7 +10622,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                   ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                   ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                    new_type = GGML_TYPE_Q5_K;
                }
@@ -10306,7 +10673,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-       new_type == GGML_TYPE_IQ3_XXS) {
+       new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
@@ -10321,8 +10688,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ3_XXS:
-           case
-           case
+           case GGML_TYPE_IQ1_S:
+           case GGML_TYPE_Q2_K:
+           case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
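This switch is the fallback path for tensors whose row length is not a multiple of the super-block size QK_K (typically 256): block-quantized K- and IQ-types that cannot tile such rows are demoted to smaller-block formats, with the new IQ1_S joining the types that fall back to IQ4_NL. A tiny sketch of the divisibility check that triggers it (the QK_K value is the usual build default, not read from ggml here):

    #include <cstdio>

    // QK_K is the super-block size used by the K-quants and IQ-quants (256 weights per block by default)
    static const int QK_K = 256;

    // a row can only use a K-/IQ-quant if its length is a multiple of the block size
    static bool row_fits_k_quant(int nx) {
        return nx % QK_K == 0;
    }

    int main() {
        printf("4096 -> %s\n", row_fits_k_quant(4096) ? "K-quant ok" : "fall back (e.g. IQ4_NL/Q5_0/Q8_0)");
        printf("1000 -> %s\n", row_fits_k_quant(1000) ? "K-quant ok" : "fall back (e.g. IQ4_NL/Q5_0/Q8_0)");
        return 0;
    }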
@@ -10363,6 +10731,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
+       case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
+       case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
 
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
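With these two entries, IQ1_S and IQ4_NL become selectable output file types for quantization. A minimal sketch of driving the quantizer through the public C API from llama.h, assuming placeholder input/output paths and default settings otherwise:

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_NL;  // new in this release, alongside LLAMA_FTYPE_MOSTLY_IQ1_S
        qparams.nthread = 4;

        // input/output paths are placeholders
        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_nl.gguf", &qparams);
        if (rc != 0) {
            fprintf(stderr, "quantization failed (%u)\n", rc);
            return 1;
        }
        return 0;
    }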
@@ -10536,6 +10906,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        }
        if ((new_type == GGML_TYPE_IQ2_XXS ||
             new_type == GGML_TYPE_IQ2_XS ||
+            new_type == GGML_TYPE_IQ1_S ||
            (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
            LLAMA_LOG_ERROR("\n\n============================================================\n");
            LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -10770,7 +11141,7 @@ static int llama_apply_lora_from_file_internal(
                {
                    LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
                            __func__, ftype);
-                   return
+                   return 1;
                }
            }
 
@@ -11059,7 +11430,7 @@ bool llama_mlock_supported(void) {
    return llama_supports_mlock();
}
 
-void llama_backend_init(
+void llama_backend_init(void) {
    ggml_time_init();
 
    // needed to initialize f16 tables
@@ -11069,15 +11440,17 @@ void llama_backend_init(bool numa) {
        ggml_free(ctx);
    }
 
-   if (numa) {
-       ggml_numa_init();
-   }
-
#ifdef GGML_USE_MPI
    ggml_mpi_backend_init();
#endif
}
 
+void llama_numa_init(enum ggml_numa_strategy numa) {
+    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+        ggml_numa_init(numa);
+    }
+}
+
void llama_backend_free(void) {
#ifdef GGML_USE_MPI
    ggml_mpi_backend_free();
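llama_backend_init no longer takes the NUMA flag; NUMA setup moves into the separate llama_numa_init, which accepts a ggml_numa_strategy. A minimal sketch of the updated startup sequence (the strategy value is chosen for illustration):

    #include "llama.h"

    int main() {
        llama_backend_init();                              // was llama_backend_init(bool numa)
        llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);    // or GGML_NUMA_STRATEGY_DISABLED to skip NUMA setup

        // ... load a model, create a context, run inference ...

        llama_backend_free();
        return 0;
    }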
@@ -11309,7 +11682,7 @@ struct llama_context * llama_new_context_with_model(
        // graph inputs
        {
            ggml_init_params init_params = {
-               /* .mem_size */ ggml_tensor_overhead()*
+               /* .mem_size */ ggml_tensor_overhead()*8,
                /* .mem_buffer */ nullptr,
                /* .no_alloc */ true,
            };
@@ -11319,15 +11692,19 @@ struct llama_context * llama_new_context_with_model(
            ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
            ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+           ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-           ctx->
+           ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+           ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
 
            ggml_set_name(ctx->inp_tokens, "inp_tokens");
            ggml_set_name(ctx->inp_embd, "inp_embd");
            ggml_set_name(ctx->inp_pos, "inp_pos");
            ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+           ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-           ggml_set_name(ctx->
+           ggml_set_name(ctx->inp_mean, "inp_mean");
+           ggml_set_name(ctx->inp_cls, "inp_cls");
 
            ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
 
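The context now pre-allocates the pooling inputs (inp_mean, inp_cls) and the position-bias input (inp_KQ_pos) alongside the existing graph inputs, which is what makes pooled embeddings usable from the C API. A hedged sketch of requesting pooled embeddings when creating a context; the field names embedding and do_pooling are taken from this release's llama.h and the model path is a placeholder:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("nomic-embed-text.gguf", mparams);  // placeholder path
        if (model == NULL) return 1;

        llama_context_params cparams = llama_context_default_params();
        cparams.embedding  = true;   // request embeddings instead of logits
        cparams.do_pooling = true;   // use the new inp_mean/inp_cls inputs to pool per sequence
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... decode a batch, then read the pooled vector with llama_get_embeddings(ctx) ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }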
@@ -11819,18 +12196,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
        data_ctx->write(&kv_used, sizeof(kv_used));
 
        if (kv_buf_size) {
-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            std::vector<uint8_t> tmp_buf;
            for (int il = 0; il < (int) n_layer; ++il) {
-
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+               tmp_buf.resize(k_size);
                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
                // v is not contiguous, copy row by row
-
+               size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+               tmp_buf.resize(v_row_size);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*
+                   ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
                }
            }
@@ -11932,17 +12310,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        if (kv_buf_size) {
            GGML_ASSERT(kv_self.total_size() == kv_buf_size);
 
-           const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            for (int il = 0; il < (int) n_layer; ++il) {
-               size_t k_size =
+               size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
                ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                inp += k_size;
 
                // v is not contiguous, copy row by row
-               size_t v_row_size =
+               size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+               size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*
+                   ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                    inp += v_row_size;
                }
            }
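Both state save and restore now size the K/V copies with ggml_row_size, which returns the number of bytes needed to hold n elements of a given type in whole quantization blocks, instead of multiplying a per-element size; that keeps session files correct when the KV cache uses a block-quantized type. A standalone illustration of what ggml_row_size reports (the byte counts in the comments are the usual ggml values; confirm against your build):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // bytes needed to store 4096 elements of each type, rounded to whole blocks
        const int64_t n = 4096;
        printf("f16 : %zu bytes\n", ggml_row_size(GGML_TYPE_F16,  n));  // 2 bytes per element
        printf("q8_0: %zu bytes\n", ggml_row_size(GGML_TYPE_Q8_0, n));  // 34 bytes per 32-element block
        printf("q4_K: %zu bytes\n", ggml_row_size(GGML_TYPE_Q4_K, n));  // 144 bytes per 256-element block
        return 0;
    }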
@@ -12332,6 +12709,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
    return 0;
}
 
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
+}
+
+// Simple version of "llama_apply_chat_template" that only works with strings
+// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+static int32_t llama_chat_apply_template_internal(
+    const std::string & tmpl,
+    const std::vector<const llama_chat_message *> & chat,
+    std::string & dest, bool add_ass) {
+    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+    std::stringstream ss;
+    if (tmpl.find("<|im_start|>") != std::string::npos) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant\n";
+        }
+    } else if (tmpl.find("[INST]") != std::string::npos) {
+        // llama2 template and its variants
+        // [variant] support system message
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+        // [variant] space before + after response
+        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        // [variant] add BOS inside history
+        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        // [variant] trim spaces from the input message
+        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        // construct the prompt
+        bool is_inside_turn = true; // skip BOS at the beginning
+        ss << "[INST] ";
+        for (auto message : chat) {
+            std::string content = strip_message ? trim(message->content) : message->content;
+            std::string role(message->role);
+            if (!is_inside_turn) {
+                is_inside_turn = true;
+                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+            }
+            if (role == "system") {
+                if (support_system_message) {
+                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                } else {
+                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                    ss << content << "\n";
+                }
+            } else if (role == "user") {
+                ss << content << " [/INST]";
+            } else {
+                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+                is_inside_turn = false;
+            }
+        }
+        // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl.find("<|user|>") != std::string::npos) {
+        // zephyr template
+        for (auto message : chat) {
+            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+        for (auto message : chat) {
+            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+            ss << bos << message->role << "\n" << message->content << "</s>\n";
+        }
+        if (add_ass) {
+            ss << "<s>assistant\n";
+        }
+    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+        // google/gemma-7b-it
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+                system_prompt = trim(message->content);
+                continue;
+            }
+            // in gemma, "assistant" is "model"
+            role = role == "assistant" ? "model" : message->role;
+            ss << "<start_of_turn>" << role << "\n";
+            if (!system_prompt.empty() && role != "model") {
+                ss << system_prompt << "\n\n";
+                system_prompt = "";
+            }
+            ss << trim(message->content) << "<end_of_turn>\n";
+        }
+        if (add_ass) {
+            ss << "<start_of_turn>model\n";
+        }
+    } else {
+        // template not supported
+        return -1;
+    }
+    dest = ss.str();
+    return dest.size();
+}
+
+LLAMA_API int32_t llama_chat_apply_template(
+                const struct llama_model * model,
+                        const char * tmpl,
+        const struct llama_chat_message * chat,
+                                    size_t n_msg,
+                                    bool add_ass,
+                                    char * buf,
+                                    int32_t length) {
+    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+    if (tmpl == nullptr) {
+        GGML_ASSERT(model != nullptr);
+        // load template from model
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        if (res < 0) {
+            // worst case: there is no information about template, we will use chatml by default
+            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+        } else {
+            curr_tmpl = std::string(model_template.data(), model_template.size());
+        }
+    }
+    // format the chat to string
+    std::vector<const llama_chat_message *> chat_vec;
+    chat_vec.resize(n_msg);
+    for (size_t i = 0; i < n_msg; i++) {
+        chat_vec[i] = &chat[i];
+    }
+    std::string formatted_chat;
+    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+    if (res < 0) {
+        return res;
+    }
+    strncpy(buf, formatted_chat.c_str(), length);
+    return res;
+}
+
struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
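llama_chat_apply_template is the new public entry point for formatting a conversation with either an explicit template string or the template stored in the model's metadata. A minimal usage sketch; the model path, messages, and buffer size are illustrative, and because the function returns the full formatted length even when the buffer is too small, a caller should grow the buffer and retry when the return value exceeds it:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        llama_backend_init();

        // placeholder model path; only the metadata (tokenizer.chat_template) is needed here
        llama_model_params mparams = llama_model_default_params();
        mparams.vocab_only = true;                 // no weights required for templating
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) return 1;

        const llama_chat_message messages[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Write a haiku about GGUF."    },
        };
        const size_t n_msg = sizeof(messages)/sizeof(messages[0]);

        std::vector<char> buf(4096);
        // tmpl == nullptr -> use the template stored in the model's metadata, falling back to chatml
        int32_t n = llama_chat_apply_template(model, nullptr, messages, n_msg, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n < 0) {
            fprintf(stderr, "chat template not supported\n");
        } else {
            if ((size_t) n > buf.size()) {         // result was truncated: grow the buffer and retry
                buf.resize(n);
                n = llama_chat_apply_template(model, nullptr, messages, n_msg, true, buf.data(), (int32_t) buf.size());
            }
            printf("%.*s\n", n, buf.data());
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }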