llama_cpp 0.12.6 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
@@ -197,6 +197,7 @@ enum llm_arch {
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
+    LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,

@@ -207,31 +208,34 @@ enum llm_arch {
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
+    LLM_ARCH_GEMMA,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama"
-    { LLM_ARCH_FALCON, "falcon"
-    { LLM_ARCH_GPT2, "gpt2"
-    { LLM_ARCH_GPTJ, "gptj"
-    { LLM_ARCH_GPTNEOX, "gptneox"
-    { LLM_ARCH_MPT, "mpt"
-    { LLM_ARCH_BAICHUAN, "baichuan"
-    { LLM_ARCH_STARCODER, "starcoder"
-    { LLM_ARCH_PERSIMMON, "persimmon"
-    { LLM_ARCH_REFACT, "refact"
-    { LLM_ARCH_BERT, "bert"
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_GEMMA, "gemma" },
 };
 
 enum llm_kv {
@@ -254,7 +258,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
-
+    LLM_KV_POOLING_TYPE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

@@ -312,7 +316,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
-    {
+    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },

@@ -375,6 +379,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT,
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
+    LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_NORM,

@@ -387,6 +392,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
+    LLM_TENSOR_LAYER_OUT_NORM,
 };
 
 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -503,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },

@@ -552,12 +557,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
             { LLM_TENSOR_TOKEN_TYPES, "token_types" },
             { LLM_TENSOR_POS_EMBD, "position_embd" },
-            {
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            {
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_NOMIC_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },

@@ -741,6 +761,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
         },
     },
+    {
+        LLM_ARCH_GEMMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1015,7 +1051,7 @@ struct llama_mmap {
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
         // prefetch/readahead impairs performance on NUMA systems
-        if (numa)
+        if (numa) { prefetch = 0; }
 #ifdef __linux__
         // advise the kernel to read the file sequentially (increases readahead)
         if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {

@@ -1485,6 +1521,7 @@ enum e_model {
     MODEL_22M,
     MODEL_33M,
     MODEL_109M,
+    MODEL_137M,
     MODEL_335M,
     MODEL_0_5B,
     MODEL_1B,
@@ -1537,12 +1574,13 @@ struct llama_hparams {
     uint32_t n_yarn_orig_ctx;
     int32_t rope_scaling_type_train;
 
-    float f_clamp_kqv;
-    float f_max_alibi_bias;
+    float f_clamp_kqv = 0.0f;
+    float f_max_alibi_bias = 0.0f;
 
     bool causal_attn = true;
-    bool
+    bool need_kq_pos = false;
 
+    uint32_t pooling_type = LLAMA_POOLING_NONE;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
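Note on the hunk above: the two new hparams fields drive behavior added elsewhere in this diff. `need_kq_pos` is derived from `f_max_alibi_bias` and turns on the per-position ALiBi input tensor, while `pooling_type` (read from the `%s.pooling_type` GGUF key) selects how BERT-style embedding models reduce per-token embeddings. A minimal sketch of that intent, not the actual llama.cpp control flow (the names `hparams_sketch` and `finalize_hparams` are illustrative only):

// Sketch: how the new hparams fields are meant to be consumed.
// Mirrors the derivation added at the end of llm_load_hparams in this diff.
enum llama_pooling_sketch { POOLING_NONE = 0, POOLING_MEAN = 1, POOLING_CLS = 2 };

struct hparams_sketch {
    float    f_max_alibi_bias = 0.0f;         // > 0 only for ALiBi models (e.g. BLOOM, MPT, Refact)
    bool     need_kq_pos      = false;        // true iff f_max_alibi_bias > 0
    uint32_t pooling_type     = POOLING_NONE; // from the %s.pooling_type GGUF key for BERT-style models
};

static void finalize_hparams(hparams_sketch & hp) {
    if (hp.f_max_alibi_bias > 0.0f) {
        hp.need_kq_pos = true;
    }
}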
@@ -1620,6 +1658,8 @@ struct llama_layer {
     struct ggml_tensor * attn_q_norm_b;
     struct ggml_tensor * attn_k_norm;
     struct ggml_tensor * attn_k_norm_b;
+    struct ggml_tensor * attn_out_norm;
+    struct ggml_tensor * attn_out_norm_b;
 
     // attention
     struct ggml_tensor * wq;

@@ -1638,6 +1678,8 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
    struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * layer_out_norm;
+    struct ggml_tensor * layer_out_norm_b;
 
     // ff
     struct ggml_tensor * ffn_gate; // w1

@@ -1899,8 +1941,10 @@ struct llama_context {
     struct ggml_tensor * inp_embd;    // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+    struct ggml_tensor * inp_KQ_pos;  // F32 [n_ctx]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
-    struct ggml_tensor *
+    struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_cls;     // I32 [n_batch]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;

@@ -2499,6 +2543,8 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
                 case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+                case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+                case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));

@@ -2744,13 +2790,7 @@ struct llama_model_loader {
 
         std::vector<no_init<uint8_t>> read_buf;
 
-        for (
-            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-            if (!cur) {
-                // some tensors may be allocated in a different context
-                continue;
-            }
-
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;

@@ -2848,6 +2888,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
 
         default: return "unknown, may not work";
     }
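The two new file types above, IQ1_S (1.5625 bpw) and IQ4_NL (4.5 bpw), become selectable quantization targets. Assuming the long-standing C quantization API of llama.cpp (llama_model_quantize and llama_model_quantize_default_params; the ftype enum value is the one this diff introduces), requantizing an existing GGUF would look roughly like this sketch:

#include "llama.h"

// Sketch: quantize a GGUF file to the new IQ4_NL file type.
// Assumes LLAMA_FTYPE_MOSTLY_IQ4_NL is exposed by this llama.cpp revision (added in this diff).
int quantize_to_iq4_nl(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_NL;
    params.nthread = 8; // worker threads used while quantizing
    return (int) llama_model_quantize(fname_inp, fname_out, &params);
}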
@@ -2855,6 +2897,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_0_5B: return "0.5B";
         case MODEL_1B: return "1B";
         case MODEL_2B: return "2B";
         case MODEL_3B: return "3B";

@@ -3024,6 +3071,11 @@ static void llm_load_hparams(
                     case 40: model.type = e_model::MODEL_13B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                if (model.type == e_model::MODEL_13B) {
+                    // TODO: become GGUF KV parameter
+                    hparams.f_max_alibi_bias = 8.0f;
+                }
             } break;
         case LLM_ARCH_STARCODER:
             {

@@ -3051,13 +3103,16 @@ static void llm_load_hparams(
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
             } break;
         case LLM_ARCH_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                 switch (hparams.n_layer) {
                     case 3:

@@ -3073,6 +3128,17 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+                    model.type = e_model::MODEL_137M;
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

@@ -3085,11 +3151,12 @@ static void llm_load_hparams(
                             case 4096: model.type = e_model::MODEL_7B; break;
                         } break;
                 }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
             } break;
         case LLM_ARCH_MPT:
             {
-                hparams.f_clamp_kqv = 0.0f;
-
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                 ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

@@ -3187,10 +3254,24 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: model.type = e_model::MODEL_2B; break;
+                    case 28: model.type = e_model::MODEL_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
     model.ftype = ml.ftype;
+
+    if (hparams.f_max_alibi_bias > 0.0f) {
+        hparams.need_kq_pos = true;
+    }
 }
 
 // TODO: This should probably be in llama.h
@@ -3634,7 +3715,7 @@ static bool llm_load_tensors(
     }
 
     // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {

@@ -3772,6 +3853,7 @@ static bool llm_load_tensors(
                 } else {
                     model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                     ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
                 }
             }
 

@@ -3875,10 +3957,14 @@ static bool llm_load_tensors(
                     }
                 } break;
             case LLM_ARCH_BERT:
+            case LLM_ARCH_NOMIC_BERT:
                 {
-                    model.tok_embd
-                    model.type_embd
-                    model.
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+                    if (model.arch == LLM_ARCH_BERT) {
+                        model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+                    }
+
                     model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
                     model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
 

@@ -3888,29 +3974,38 @@ static bool llm_load_tensors(
 
                         auto & layer = model.layers[i];
 
-
-
+                        if (model.arch == LLM_ARCH_BERT) {
+                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
 
-
-
+                            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
 
-
-
+                            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+                        } else {
+                            layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        }
 
-                        layer.
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
-                        layer.
-                        layer.
+                        layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+                        layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
 
-                        layer.
-                        layer.
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
 
-
-
+                        if (model.arch == LLM_ARCH_BERT) {
+                            layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+                            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
 
-
-
+                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                        } else {
+                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        }
+
+                        layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                        layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
                     }
                 } break;
             case LLM_ARCH_BLOOM:

@@ -3958,7 +4053,12 @@ static bool llm_load_tensors(
                     // output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.
+                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {

@@ -3967,14 +4067,23 @@ static bool llm_load_tensors(
 
                         auto & layer = model.layers[i];
 
-                        layer.attn_norm
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
 
                         layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
 
-                        layer.ffn_norm
-                        layer.
-
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
 
                         // AWQ ScaleActivation layer
                         layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4287,6 +4396,40 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     }
                 } break;
+            case LLM_ARCH_GEMMA:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+
+                    const int64_t n_ff = hparams.n_ff;
+                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+                    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -4720,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
+        struct ggml_tensor * kq_pos,
         int64_t n_ctx,
         int32_t n_tokens,
         int32_t n_kv,
-        float max_alibi_bias,
         float kq_scale,
         const llm_build_cb & cb,
         int il) {

@@ -4753,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
     }
 
-
-
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+#pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
+#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
+    if (hparams.f_max_alibi_bias > 0.0f) {
         kq = ggml_scale(ctx, kq, kq_scale);
         cb(kq, "kq_scaled", il);
 
-
-
-        // TODO: K-shift is likely not working
-        // TODO: change to ggml_add
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
-        }
+        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+        cb(kq, "kq_scaled_alibi", il);
 
         kq = ggml_add(ctx, kq, kq_mask);
         cb(kq, "kq_masked", il);
 
         kq = ggml_soft_max(ctx, kq);
         cb(kq, "kq_soft_max", il);
-    } else
-
+    } else
+#endif
+    {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
         cb(kq, "kq_soft_max_ext", il);
     }
 
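The refactor above replaces the per-call max_alibi_bias argument with a per-context KQ_pos tensor plus hparams.f_max_alibi_bias, so ggml_soft_max_ext can fuse scaling, masking, and the ALiBi position bias (ggml_alibi remains only as a fallback for backends that have not implemented it yet). The bias itself is the usual ALiBi term: each head gets a slope and each KV position p contributes slope * p to the logit. A hedged, stand-alone sketch of the slope schedule as commonly used by ggml-style kernels (illustrative, not the actual kernel code):

#include <cmath>
#include <vector>

// Sketch: per-head ALiBi slopes for a given max_bias (e.g. 8.0f for BLOOM/MPT-style models).
// Heads up to the nearest power of two use powers of m0; the remainder interleave powers of m1.
static std::vector<float> alibi_slopes(int n_head, float max_bias) {
    const int   n_head_log2 = 1 << (int) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias)        / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    std::vector<float> slopes(n_head);
    for (int h = 0; h < n_head; ++h) {
        slopes[h] = h < n_head_log2 ? powf(m0, h + 1)
                                    : powf(m1, 2*(h - n_head_log2) + 1);
    }
    return slopes; // bias added to logits: slopes[h] * pos[k]
}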
@@ -4820,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
+        struct ggml_tensor * kq_pos,
         int64_t n_ctx,
         int32_t n_tokens,
         int32_t kv_head,
         int32_t n_kv,
-        float max_alibi_bias,
         float kq_scale,
         const llm_build_cb & cb,
         int il) {

@@ -4838,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
 
     struct ggml_tensor * cur;
-    cur = llm_build_kqv(ctx, model, hparams, kv, graph,
-
-        q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;

@@ -4881,7 +5023,7 @@ struct llm_build_context {
     const int32_t n_orig_ctx;
 
     const bool do_rope_shift;
-    const
+    const uint32_t pooling_type;
 
     const llm_build_cb & cb;
 

@@ -4925,7 +5067,7 @@ struct llm_build_context {
         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
         do_rope_shift (worst_case || kv_self.has_shift),
-        do_pooling
+        pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()

@@ -5008,7 +5150,7 @@ struct llm_build_context {
                 }
 
                 Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );

@@ -5023,7 +5165,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5153,6 +5295,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         // shift the entire K-cache if needed
         if (do_rope_shift) {
             llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);

@@ -5201,12 +5347,9 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
 
 
-                // apply ALiBi for 13B model
-                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
-
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -5330,7 +5473,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -5429,7 +5572,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -5634,7 +5777,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -5696,6 +5839,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 

@@ -5723,7 +5870,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5773,6 +5920,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;

@@ -5781,7 +5929,8 @@ struct llm_build_context {
         // get input vectors with right size
         const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
         struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        struct ggml_tensor *
+        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);

@@ -5789,7 +5938,9 @@ struct llm_build_context {
         // token types are hardcoded to zero ("Sentence A")
         struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
         inpL = ggml_add(ctx0, inpL, type_row0);
-
+        if (model.arch == LLM_ARCH_BERT) {
+            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+        }
         cb(inpL, "inp_embd", -1);
 
         // embed layer norm

@@ -5805,7 +5956,7 @@ struct llm_build_context {
             struct ggml_tensor * cur = inpL;
 
             // self-attention
-            {
+            if (model.arch == LLM_ARCH_BERT) {
                 struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
 

@@ -5820,7 +5971,38 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            } else {
+                // compute Q and K and RoPE them
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5828,25 +6010,34 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, inpL);
 
             // attention layer norm
-            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
             struct ggml_tensor * ffn_inp = cur;
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
-
-
-
-
-
-
+            if (model.arch == LLM_ARCH_BERT) {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                        NULL, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            } else {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            }
             cb(cur, "ffn_out", il);
 
             // attentions bypass the intermediate layer
             cur = ggml_add(ctx0, cur, ffn_inp);
 
             // output layer norm
-            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
 
             // input for next layer
             inpL = cur;

@@ -5856,8 +6047,12 @@ struct llm_build_context {
         cur = inpL;
 
         // pooling layer
-        if (
-            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)),
+        if (pooling_type == LLAMA_POOLING_MEAN) {
+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+        } else if (pooling_type == LLAMA_POOLING_CLS) {
+            cur = ggml_get_rows(ctx0, cur, inp_cls);
+        } else {
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
         }
         cb(cur, "result_embd", -1);
 
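In the pooling hunk above, mean pooling is expressed as a single matmul: the hidden state is [n_embd, n_tokens] and inp_mean is an [n_tokens, n_tokens] matrix holding 1/len(seq) in the row of each sequence, so the product yields one averaged embedding per sequence; CLS pooling simply gathers the row of each sequence's first token via ggml_get_rows. A small CPU-side sketch of the same reduction, written outside ggml for clarity:

#include <cstdint>
#include <vector>

// Sketch: mean-pool per-token embeddings into one vector per sequence.
// emb is n_tokens rows of n_embd floats; seq_id[i] gives the sequence of token i.
// This computes explicitly what the inp_mean matmul in the graph computes.
static std::vector<std::vector<float>> mean_pool(
        const std::vector<std::vector<float>> & emb,
        const std::vector<int32_t> & seq_id,
        int n_seq) {
    const size_t n_embd = emb.empty() ? 0 : emb[0].size();
    std::vector<std::vector<float>> out(n_seq, std::vector<float>(n_embd, 0.0f));
    std::vector<int> count(n_seq, 0);

    for (size_t i = 0; i < emb.size(); ++i) {
        const int s = seq_id[i];
        count[s] += 1;
        for (size_t d = 0; d < n_embd; ++d) {
            out[s][d] += emb[i][d];
        }
    }
    for (int s = 0; s < n_seq; ++s) {
        if (count[s] > 0) {
            for (size_t d = 0; d < n_embd; ++d) {
                out[s][d] /= (float) count[s];
            }
        }
    }
    return out;
}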
@@ -5883,6 +6078,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         inpL = llm_build_norm(ctx0, inpL, hparams,
                 model.tok_norm,
                 model.tok_norm_b,

@@ -5916,7 +6115,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -5976,12 +6175,16 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;
 
             attn_norm = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
-
+                    model.layers[il].attn_norm_b,
                     LLM_NORM, cb, il);
             cb(attn_norm, "attn_norm", il);
 

@@ -5992,6 +6195,11 @@ struct llm_build_context {
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
+                if (model.layers[il].bqkv){
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 if (hparams.f_clamp_kqv > 0.0f) {
                     cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                     cb(cur, "wqkv_clamped", il);

@@ -6008,8 +6216,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -6021,13 +6229,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
-                        model.layers[il].ffn_down,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);

@@ -6044,7 +6252,7 @@ struct llm_build_context {
 
         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
 
@@ -6131,7 +6339,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -6246,7 +6454,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -6367,7 +6575,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -6494,7 +6702,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -6597,7 +6805,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
             struct ggml_tensor * sa_out = cur;

@@ -6696,7 +6904,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -6805,7 +7013,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -6923,7 +7131,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -7042,7 +7250,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 

@@ -7174,7 +7382,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -7233,6 +7441,116 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_gemma() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
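The new build_gemma graph above follows the usual decoder layout with three Gemma-specific details: token embeddings are scaled by sqrt(n_embd) before the first layer, Q is pre-scaled by 1/sqrt(n_embd_head_k) so llm_build_kv is called with a kq_scale of 1.0f, and the feed-forward is a gated GELU (LLM_FFN_GELU with LLM_FFN_PAR) rather than the SiLU gate used by LLaMA. A hedged scalar sketch of that gated-GELU FFN as llm_build_ffn composes it, for reference only:

#include <cmath>

// Sketch of the gated-GELU FFN selected by LLM_FFN_GELU + LLM_FFN_PAR:
//   ffn(x) = W_down * ( gelu(W_gate * x) ⊙ (W_up * x) )
// i.e. the activation is applied to the gate branch and multiplied elementwise
// with the parallel up branch before the down projection.
static inline float gelu_approx(float x) {
    // tanh approximation of GELU, matching the common ggml formulation
    return 0.5f * x * (1.0f + tanhf(0.79788456f * (x + 0.044715f * x * x * x)));
}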
@@ -7289,6 +7607,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_refact();
             } break;
         case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
             {
                 result = llm.build_bert();
             } break;

@@ -7340,6 +7659,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_minicpm();
             } break;
+        case LLM_ARCH_GEMMA:
+            {
+                result = llm.build_gemma();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -7404,12 +7727,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    {
-
-
+    if (hparams.need_kq_pos) {
+        const int64_t n_kv = kv_self.n;
+
+        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
 
-
-
+        float * data = (float *) lctx.inp_KQ_pos->data;
+
+        for (int i = 0; i < n_kv; ++i) {
+            data[i] = float(lctx.kv_self.cells[i].pos);
         }
     }
 

@@ -7425,17 +7751,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.
-        float * data = (float *) lctx.
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+        float * data = (float *) lctx.inp_mean->data;
 
-        memset(lctx.
+        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
 
+        std::vector<uint64_t> sum(n_tokens, 0);
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
-
+            sum[seq_id] += 1;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            data[seq_id*n_tokens + i] = div[seq_id];
+        }
+    }
+
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos pos = batch.pos[i];
+            if (pos == 0) {
+                data[seq_id] = i;
+            }
         }
     }
 }
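The host-side setup above fills the pooling inputs: for every token i belonging to sequence s, inp_mean[s][i] is set to 1/|s| so the later matmul produces the per-sequence average, and inp_cls records, per sequence, the batch index of its position-0 token. A sketch of the same matrix construction in isolation (the helper name build_inp_mean is illustrative):

#include <cstdint>
#include <vector>

// Sketch: build the [n_tokens x n_tokens] mean-pooling matrix that llama_set_inputs
// writes into inp_mean. Row s holds 1/len(s) at the columns of the tokens of sequence s.
static std::vector<float> build_inp_mean(const std::vector<int32_t> & seq_id, int64_t n_tokens) {
    std::vector<float> data(n_tokens * n_tokens, 0.0f);

    std::vector<uint64_t> sum(n_tokens, 0);
    for (int64_t i = 0; i < n_tokens; ++i) {
        sum[seq_id[i]] += 1;
    }

    for (int64_t i = 0; i < n_tokens; ++i) {
        const int32_t s = seq_id[i];
        data[s*n_tokens + i] = 1.0f / (float) sum[s]; // sum[s] >= 1 since token i belongs to s
    }
    return data;
}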
@@ -10145,25 +10500,28 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };
 
-
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
            new_type = GGML_TYPE_Q2_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_Q4_K;
        }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = GGML_TYPE_Q2_K;
@@ -10173,6 +10531,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
             ++qs.i_ffn_down;
         }
+        else if (name.find("attn_output.weight") != std::string::npos) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+        }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -10187,6 +10548,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -10239,6 +10603,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
@@ -10255,7 +10622,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
@@ -10306,7 +10673,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS) {
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -10321,8 +10688,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ3_XXS:
-            case
-            case
+            case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
             case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
             case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
             case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
@@ -10363,6 +10731,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -10536,6 +10906,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
         if ((new_type == GGML_TYPE_IQ2_XXS ||
              new_type == GGML_TYPE_IQ2_XS ||
+             new_type == GGML_TYPE_IQ1_S ||
             (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
             LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -10770,7 +11141,7 @@ static int llama_apply_lora_from_file_internal(
         {
             LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
                             __func__, ftype);
-            return
+            return 1;
         }
     }
 
@@ -11059,7 +11430,7 @@ bool llama_mlock_supported(void) {
     return llama_supports_mlock();
 }
 
-void llama_backend_init(
+void llama_backend_init(void) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -11069,15 +11440,17 @@ void llama_backend_init(bool numa) {
         ggml_free(ctx);
     }
 
-    if (numa) {
-        ggml_numa_init();
-    }
-
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_init();
 #endif
 }
 
+void llama_numa_init(enum ggml_numa_strategy numa) {
+    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+        ggml_numa_init(numa);
+    }
+}
+
 void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
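This is an API change for callers: llama_backend_init() loses its bool numa parameter, and NUMA setup moves to the separate llama_numa_init(), which takes a ggml_numa_strategy. A minimal init sequence under the new API might look as follows; GGML_NUMA_STRATEGY_DISTRIBUTE is assumed to be one of the strategies declared in ggml.h, and passing GGML_NUMA_STRATEGY_DISABLED keeps the old numa=false behaviour.

// Sketch of the new initialization sequence (illustration, not part of the diff).
#include "llama.h"

int main() {
    llama_backend_init();                           // no longer takes a bool numa flag
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // opt in to a NUMA strategy explicitly

    // ... load the model, create a context, run inference ...

    llama_backend_free();
    return 0;
}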
@@ -11309,7 +11682,7 @@ struct llama_context * llama_new_context_with_model(
     // graph inputs
     {
         ggml_init_params init_params = {
-            /* .mem_size */ ggml_tensor_overhead()*
+            /* .mem_size */ ggml_tensor_overhead()*8,
            /* .mem_buffer */ nullptr,
            /* .no_alloc */ true,
        };
@@ -11319,15 +11692,19 @@ struct llama_context * llama_new_context_with_model(
         ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
         ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
         ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+        ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
         ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-        ctx->
+        ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+        ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
 
         ggml_set_name(ctx->inp_tokens, "inp_tokens");
         ggml_set_name(ctx->inp_embd, "inp_embd");
         ggml_set_name(ctx->inp_pos, "inp_pos");
         ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+        ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
         ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-        ggml_set_name(ctx->
+        ggml_set_name(ctx->inp_mean, "inp_mean");
+        ggml_set_name(ctx->inp_cls, "inp_cls");
 
         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
 
@@ -11819,18 +12196,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(&kv_used, sizeof(kv_used));
 
         if (kv_buf_size) {
-            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
             std::vector<uint8_t> tmp_buf;
             for (int il = 0; il < (int) n_layer; ++il) {
-
+                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+                tmp_buf.resize(k_size);
                 ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                 data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
                 // v is not contiguous, copy row by row
-
+                size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+                tmp_buf.resize(v_row_size);
                 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                     data_ctx->write(tmp_buf.data(), tmp_buf.size());
                 }
             }
@@ -11932,17 +12310,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     if (kv_buf_size) {
         GGML_ASSERT(kv_self.total_size() == kv_buf_size);
 
-        const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
         for (int il = 0; il < (int) n_layer; ++il) {
-            size_t k_size =
+            size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
             ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
             inp += k_size;
 
             // v is not contiguous, copy row by row
-            size_t v_row_size =
+            size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+            size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
             for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                 inp += v_row_size;
             }
         }
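Both state-copy functions above switch from ggml_element_size() to ggml_row_size() when sizing the K/V buffers, so the session-state code stays correct if the cache tensors use a block-quantized type, where bytes-per-row is not simply element_size * n. A small sketch of the difference, assuming ggml_row_size(type, ne) from ggml.h and an illustrative element count:

// Illustration only (not part of the diff): bytes needed for one "row" of n values
// depends on the tensor type; block-quantized types pack scales together with the data.
#include "ggml.h"
#include <cstdio>

int main() {
    const int64_t n = 4096; // e.g. n_embd_k_gqa * kv_head elements of the K cache

    printf("f16 : %zu bytes\n", ggml_row_size(GGML_TYPE_F16,  n)); // 4096 * 2       = 8192
    printf("q8_0: %zu bytes\n", ggml_row_size(GGML_TYPE_Q8_0, n)); // 4096 / 32 * 34 = 4352
    return 0;
}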
@@ -12332,6 +12709,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     return 0;
 }
 
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
+}
+
+// Simple version of "llama_apply_chat_template" that only works with strings
+// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+static int32_t llama_chat_apply_template_internal(
+    const std::string & tmpl,
+    const std::vector<const llama_chat_message *> & chat,
+    std::string & dest, bool add_ass) {
+    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+    std::stringstream ss;
+    if (tmpl.find("<|im_start|>") != std::string::npos) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant\n";
+        }
+    } else if (tmpl.find("[INST]") != std::string::npos) {
+        // llama2 template and its variants
+        // [variant] support system message
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+        // [variant] space before + after response
+        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        // [variant] add BOS inside history
+        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        // [variant] trim spaces from the input message
+        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        // construct the prompt
+        bool is_inside_turn = true; // skip BOS at the beginning
+        ss << "[INST] ";
+        for (auto message : chat) {
+            std::string content = strip_message ? trim(message->content) : message->content;
+            std::string role(message->role);
+            if (!is_inside_turn) {
+                is_inside_turn = true;
+                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+            }
+            if (role == "system") {
+                if (support_system_message) {
+                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                } else {
+                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                    ss << content << "\n";
+                }
+            } else if (role == "user") {
+                ss << content << " [/INST]";
+            } else {
+                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+                is_inside_turn = false;
+            }
+        }
+        // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl.find("<|user|>") != std::string::npos) {
+        // zephyr template
+        for (auto message : chat) {
+            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+        for (auto message : chat) {
+            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+            ss << bos << message->role << "\n" << message->content << "</s>\n";
+        }
+        if (add_ass) {
+            ss << "<s>assistant\n";
+        }
+    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+        // google/gemma-7b-it
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+                system_prompt = trim(message->content);
+                continue;
+            }
+            // in gemma, "assistant" is "model"
+            role = role == "assistant" ? "model" : message->role;
+            ss << "<start_of_turn>" << role << "\n";
+            if (!system_prompt.empty() && role != "model") {
+                ss << system_prompt << "\n\n";
+                system_prompt = "";
+            }
+            ss << trim(message->content) << "<end_of_turn>\n";
+        }
+        if (add_ass) {
+            ss << "<start_of_turn>model\n";
+        }
+    } else {
+        // template not supported
+        return -1;
+    }
+    dest = ss.str();
+    return dest.size();
+}
+
+LLAMA_API int32_t llama_chat_apply_template(
+        const struct llama_model * model,
+                      const char * tmpl,
+ const struct llama_chat_message * chat,
+                          size_t   n_msg,
+                            bool   add_ass,
+                            char * buf,
+                         int32_t   length) {
+    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+    if (tmpl == nullptr) {
+        GGML_ASSERT(model != nullptr);
+        // load template from model
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        if (res < 0) {
+            // worst case: there is no information about template, we will use chatml by default
+            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+        } else {
+            curr_tmpl = std::string(model_template.data(), model_template.size());
+        }
+    }
+    // format the chat to string
+    std::vector<const llama_chat_message *> chat_vec;
+    chat_vec.resize(n_msg);
+    for (size_t i = 0; i < n_msg; i++) {
+        chat_vec[i] = &chat[i];
+    }
+    std::string formatted_chat;
+    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+    if (res < 0) {
+        return res;
+    }
+    strncpy(buf, formatted_chat.c_str(), length);
+    return res;
+}
+
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
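The new public entry point added above is llama_chat_apply_template(), declared in llama.h together with struct llama_chat_message. A minimal usage sketch follows; the explicit "<|im_start|>" template string and the buffer size are illustrative choices, and passing tmpl == nullptr instead makes the function read the model's tokenizer.chat_template metadata.

// Usage sketch for llama_chat_apply_template() (illustration, not part of the diff).
#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(
        /*model  =*/ nullptr,        // only consulted when tmpl is nullptr
        /*tmpl   =*/ "<|im_start|>", // any template containing this marker selects chatml
        /*chat   =*/ chat,
        /*n_msg  =*/ n_msg,
        /*add_ass=*/ true,           // append the "<|im_start|>assistant\n" generation prompt
        /*buf    =*/ buf.data(),
        /*length =*/ (int32_t) buf.size());

    if (n < 0) {
        fprintf(stderr, "template not supported\n");
        return 1;
    }
    printf("%.*s", n, buf.data());
    return 0;
}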