llama_cpp 0.12.6 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/ext/llama_cpp/llama_cpp.cpp +90 -269
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +28 -23
- data/vendor/tmp/llama.cpp/Makefile +51 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -11
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +191 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +2472 -862
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +3176 -667
- data/vendor/tmp/llama.cpp/ggml-quants.h +77 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +373 -424
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +186 -102
- data/vendor/tmp/llama.cpp/ggml.c +1266 -699
- data/vendor/tmp/llama.cpp/ggml.h +59 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1517 -717
- data/vendor/tmp/llama.cpp/llama.h +87 -63
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
|
@@ -68,10 +68,12 @@
|
|
|
68
68
|
#include <cstdio>
|
|
69
69
|
#include <cstring>
|
|
70
70
|
#include <ctime>
|
|
71
|
+
#include <cwctype>
|
|
71
72
|
#include <forward_list>
|
|
72
73
|
#include <fstream>
|
|
73
74
|
#include <functional>
|
|
74
75
|
#include <initializer_list>
|
|
76
|
+
#include <locale>
|
|
75
77
|
#include <map>
|
|
76
78
|
#include <memory>
|
|
77
79
|
#include <mutex>
|
|
@@ -197,6 +199,7 @@ enum llm_arch {
|
|
|
197
199
|
LLM_ARCH_PERSIMMON,
|
|
198
200
|
LLM_ARCH_REFACT,
|
|
199
201
|
LLM_ARCH_BERT,
|
|
202
|
+
LLM_ARCH_NOMIC_BERT,
|
|
200
203
|
LLM_ARCH_BLOOM,
|
|
201
204
|
LLM_ARCH_STABLELM,
|
|
202
205
|
LLM_ARCH_QWEN,
|
|
@@ -207,31 +210,34 @@ enum llm_arch {
|
|
|
207
210
|
LLM_ARCH_ORION,
|
|
208
211
|
LLM_ARCH_INTERNLM2,
|
|
209
212
|
LLM_ARCH_MINICPM,
|
|
213
|
+
LLM_ARCH_GEMMA,
|
|
210
214
|
LLM_ARCH_UNKNOWN,
|
|
211
215
|
};
|
|
212
216
|
|
|
213
217
|
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
214
|
-
{ LLM_ARCH_LLAMA, "llama"
|
|
215
|
-
{ LLM_ARCH_FALCON, "falcon"
|
|
216
|
-
{ LLM_ARCH_GPT2, "gpt2"
|
|
217
|
-
{ LLM_ARCH_GPTJ, "gptj"
|
|
218
|
-
{ LLM_ARCH_GPTNEOX, "gptneox"
|
|
219
|
-
{ LLM_ARCH_MPT, "mpt"
|
|
220
|
-
{ LLM_ARCH_BAICHUAN, "baichuan"
|
|
221
|
-
{ LLM_ARCH_STARCODER, "starcoder"
|
|
222
|
-
{ LLM_ARCH_PERSIMMON, "persimmon"
|
|
223
|
-
{ LLM_ARCH_REFACT, "refact"
|
|
224
|
-
{ LLM_ARCH_BERT, "bert"
|
|
225
|
-
{
|
|
226
|
-
{
|
|
227
|
-
{
|
|
228
|
-
{
|
|
229
|
-
{
|
|
230
|
-
{
|
|
231
|
-
{
|
|
232
|
-
{
|
|
233
|
-
{
|
|
234
|
-
{
|
|
218
|
+
{ LLM_ARCH_LLAMA, "llama" },
|
|
219
|
+
{ LLM_ARCH_FALCON, "falcon" },
|
|
220
|
+
{ LLM_ARCH_GPT2, "gpt2" },
|
|
221
|
+
{ LLM_ARCH_GPTJ, "gptj" },
|
|
222
|
+
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
|
223
|
+
{ LLM_ARCH_MPT, "mpt" },
|
|
224
|
+
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
|
225
|
+
{ LLM_ARCH_STARCODER, "starcoder" },
|
|
226
|
+
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
|
227
|
+
{ LLM_ARCH_REFACT, "refact" },
|
|
228
|
+
{ LLM_ARCH_BERT, "bert" },
|
|
229
|
+
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
|
230
|
+
{ LLM_ARCH_BLOOM, "bloom" },
|
|
231
|
+
{ LLM_ARCH_STABLELM, "stablelm" },
|
|
232
|
+
{ LLM_ARCH_QWEN, "qwen" },
|
|
233
|
+
{ LLM_ARCH_QWEN2, "qwen2" },
|
|
234
|
+
{ LLM_ARCH_PHI2, "phi2" },
|
|
235
|
+
{ LLM_ARCH_PLAMO, "plamo" },
|
|
236
|
+
{ LLM_ARCH_CODESHELL, "codeshell" },
|
|
237
|
+
{ LLM_ARCH_ORION, "orion" },
|
|
238
|
+
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
|
239
|
+
{ LLM_ARCH_MINICPM, "minicpm" },
|
|
240
|
+
{ LLM_ARCH_GEMMA, "gemma" },
|
|
235
241
|
};
|
|
236
242
|
|
|
237
243
|
enum llm_kv {
|
|
@@ -254,7 +260,7 @@ enum llm_kv {
|
|
|
254
260
|
LLM_KV_TENSOR_DATA_LAYOUT,
|
|
255
261
|
LLM_KV_EXPERT_COUNT,
|
|
256
262
|
LLM_KV_EXPERT_USED_COUNT,
|
|
257
|
-
|
|
263
|
+
LLM_KV_POOLING_TYPE,
|
|
258
264
|
|
|
259
265
|
LLM_KV_ATTENTION_HEAD_COUNT,
|
|
260
266
|
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
|
@@ -312,7 +318,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
312
318
|
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
|
313
319
|
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
|
314
320
|
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
|
315
|
-
{
|
|
321
|
+
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
|
316
322
|
|
|
317
323
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
|
318
324
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
|
@@ -375,6 +381,7 @@ enum llm_tensor {
|
|
|
375
381
|
LLM_TENSOR_ATTN_OUT,
|
|
376
382
|
LLM_TENSOR_ATTN_NORM,
|
|
377
383
|
LLM_TENSOR_ATTN_NORM_2,
|
|
384
|
+
LLM_TENSOR_ATTN_OUT_NORM,
|
|
378
385
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
|
379
386
|
LLM_TENSOR_FFN_GATE_INP,
|
|
380
387
|
LLM_TENSOR_FFN_NORM,
|
|
@@ -387,6 +394,7 @@ enum llm_tensor {
|
|
|
387
394
|
LLM_TENSOR_FFN_UP_EXP,
|
|
388
395
|
LLM_TENSOR_ATTN_Q_NORM,
|
|
389
396
|
LLM_TENSOR_ATTN_K_NORM,
|
|
397
|
+
LLM_TENSOR_LAYER_OUT_NORM,
|
|
390
398
|
};
|
|
391
399
|
|
|
392
400
|
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
@@ -503,7 +511,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
|
503
511
|
{
|
|
504
512
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
505
513
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
506
|
-
{ LLM_TENSOR_OUTPUT, "output" },
|
|
507
514
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
508
515
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
509
516
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
|
@@ -552,12 +559,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
|
552
559
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
|
553
560
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
|
554
561
|
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
|
555
|
-
{
|
|
562
|
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
|
556
563
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
557
564
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
558
565
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
559
566
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
560
|
-
{
|
|
567
|
+
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
|
568
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
569
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
570
|
+
},
|
|
571
|
+
},
|
|
572
|
+
{
|
|
573
|
+
LLM_ARCH_NOMIC_BERT,
|
|
574
|
+
{
|
|
575
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
576
|
+
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
|
577
|
+
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
|
578
|
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
|
579
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
|
580
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
581
|
+
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
|
582
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
561
583
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
562
584
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
563
585
|
},
|
|
@@ -741,6 +763,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
|
741
763
|
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
|
742
764
|
},
|
|
743
765
|
},
|
|
766
|
+
{
|
|
767
|
+
LLM_ARCH_GEMMA,
|
|
768
|
+
{
|
|
769
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
770
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
771
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
772
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
773
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
774
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
775
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
776
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
777
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
778
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
779
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
780
|
+
},
|
|
781
|
+
},
|
|
744
782
|
{
|
|
745
783
|
LLM_ARCH_UNKNOWN,
|
|
746
784
|
{
|
|
@@ -814,9 +852,9 @@ struct LLM_TN {
|
|
|
814
852
|
//
|
|
815
853
|
|
|
816
854
|
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
|
817
|
-
{
|
|
818
|
-
{
|
|
819
|
-
{
|
|
855
|
+
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
|
|
856
|
+
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
|
|
857
|
+
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
|
|
820
858
|
};
|
|
821
859
|
|
|
822
860
|
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
|
@@ -826,7 +864,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
|
|
826
864
|
}
|
|
827
865
|
}
|
|
828
866
|
|
|
829
|
-
return
|
|
867
|
+
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
|
830
868
|
}
|
|
831
869
|
|
|
832
870
|
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
|
@@ -1015,7 +1053,7 @@ struct llama_mmap {
|
|
|
1015
1053
|
int fd = fileno(file->fp);
|
|
1016
1054
|
int flags = MAP_SHARED;
|
|
1017
1055
|
// prefetch/readahead impairs performance on NUMA systems
|
|
1018
|
-
if (numa)
|
|
1056
|
+
if (numa) { prefetch = 0; }
|
|
1019
1057
|
#ifdef __linux__
|
|
1020
1058
|
// advise the kernel to read the file sequentially (increases readahead)
|
|
1021
1059
|
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
|
|
@@ -1485,6 +1523,7 @@ enum e_model {
|
|
|
1485
1523
|
MODEL_22M,
|
|
1486
1524
|
MODEL_33M,
|
|
1487
1525
|
MODEL_109M,
|
|
1526
|
+
MODEL_137M,
|
|
1488
1527
|
MODEL_335M,
|
|
1489
1528
|
MODEL_0_5B,
|
|
1490
1529
|
MODEL_1B,
|
|
@@ -1513,8 +1552,9 @@ static const size_t MiB = 1024*kiB;
|
|
|
1513
1552
|
static const size_t GiB = 1024*MiB;
|
|
1514
1553
|
|
|
1515
1554
|
struct llama_hparams {
|
|
1516
|
-
bool
|
|
1517
|
-
bool
|
|
1555
|
+
bool vocab_only;
|
|
1556
|
+
bool rope_finetuned;
|
|
1557
|
+
|
|
1518
1558
|
uint32_t n_vocab;
|
|
1519
1559
|
uint32_t n_ctx_train; // context size the model was trained on
|
|
1520
1560
|
uint32_t n_embd;
|
|
@@ -1537,12 +1577,14 @@ struct llama_hparams {
|
|
|
1537
1577
|
uint32_t n_yarn_orig_ctx;
|
|
1538
1578
|
int32_t rope_scaling_type_train;
|
|
1539
1579
|
|
|
1540
|
-
float f_clamp_kqv;
|
|
1541
|
-
float f_max_alibi_bias;
|
|
1580
|
+
float f_clamp_kqv = 0.0f;
|
|
1581
|
+
float f_max_alibi_bias = 0.0f;
|
|
1542
1582
|
|
|
1543
1583
|
bool causal_attn = true;
|
|
1544
|
-
bool
|
|
1584
|
+
bool need_kq_pos = false;
|
|
1545
1585
|
|
|
1586
|
+
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
1587
|
+
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
|
1546
1588
|
|
|
1547
1589
|
bool operator!=(const llama_hparams & other) const {
|
|
1548
1590
|
if (this->vocab_only != other.vocab_only) return true;
|
|
@@ -1601,8 +1643,8 @@ struct llama_cparams {
|
|
|
1601
1643
|
float yarn_attn_factor;
|
|
1602
1644
|
float yarn_beta_fast;
|
|
1603
1645
|
float yarn_beta_slow;
|
|
1646
|
+
float defrag_thold;
|
|
1604
1647
|
|
|
1605
|
-
bool mul_mat_q;
|
|
1606
1648
|
bool offload_kqv;
|
|
1607
1649
|
bool do_pooling;
|
|
1608
1650
|
|
|
@@ -1620,6 +1662,8 @@ struct llama_layer {
|
|
|
1620
1662
|
struct ggml_tensor * attn_q_norm_b;
|
|
1621
1663
|
struct ggml_tensor * attn_k_norm;
|
|
1622
1664
|
struct ggml_tensor * attn_k_norm_b;
|
|
1665
|
+
struct ggml_tensor * attn_out_norm;
|
|
1666
|
+
struct ggml_tensor * attn_out_norm_b;
|
|
1623
1667
|
|
|
1624
1668
|
// attention
|
|
1625
1669
|
struct ggml_tensor * wq;
|
|
@@ -1638,6 +1682,8 @@ struct llama_layer {
|
|
|
1638
1682
|
// normalization
|
|
1639
1683
|
struct ggml_tensor * ffn_norm;
|
|
1640
1684
|
struct ggml_tensor * ffn_norm_b;
|
|
1685
|
+
struct ggml_tensor * layer_out_norm;
|
|
1686
|
+
struct ggml_tensor * layer_out_norm_b;
|
|
1641
1687
|
|
|
1642
1688
|
// ff
|
|
1643
1689
|
struct ggml_tensor * ffn_gate; // w1
|
|
@@ -1665,11 +1711,20 @@ struct llama_kv_cell {
|
|
|
1665
1711
|
bool has_seq_id(const llama_seq_id & id) const {
|
|
1666
1712
|
return seq_id.find(id) != seq_id.end();
|
|
1667
1713
|
}
|
|
1714
|
+
|
|
1715
|
+
bool is_empty() const {
|
|
1716
|
+
return seq_id.empty();
|
|
1717
|
+
}
|
|
1718
|
+
|
|
1719
|
+
bool is_same_seq(const llama_kv_cell & other) const {
|
|
1720
|
+
return seq_id == other.seq_id;
|
|
1721
|
+
}
|
|
1668
1722
|
};
|
|
1669
1723
|
|
|
1670
1724
|
// ring-buffer of cached KV data
|
|
1671
1725
|
struct llama_kv_cache {
|
|
1672
1726
|
bool has_shift = false;
|
|
1727
|
+
bool do_defrag = false;
|
|
1673
1728
|
|
|
1674
1729
|
// Note: The value of head isn't only used to optimize searching
|
|
1675
1730
|
// for a free KV slot. llama_decode_internal also uses it, so it
|
|
@@ -1681,6 +1736,9 @@ struct llama_kv_cache {
|
|
|
1681
1736
|
// computed before each graph build
|
|
1682
1737
|
uint32_t n = 0;
|
|
1683
1738
|
|
|
1739
|
+
ggml_type type_k = GGML_TYPE_F16;
|
|
1740
|
+
ggml_type type_v = GGML_TYPE_F16;
|
|
1741
|
+
|
|
1684
1742
|
std::vector<llama_kv_cell> cells;
|
|
1685
1743
|
|
|
1686
1744
|
std::vector<struct ggml_tensor *> k_l; // per layer
|
|
@@ -1899,8 +1957,10 @@ struct llama_context {
|
|
|
1899
1957
|
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
|
1900
1958
|
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
|
1901
1959
|
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
|
1960
|
+
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
|
|
1902
1961
|
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
|
1903
|
-
struct ggml_tensor *
|
|
1962
|
+
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
|
1963
|
+
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
|
1904
1964
|
|
|
1905
1965
|
#ifdef GGML_USE_MPI
|
|
1906
1966
|
ggml_mpi_context * ctx_mpi = NULL;
|
|
@@ -1914,8 +1974,8 @@ struct llama_context {
|
|
|
1914
1974
|
static bool llama_kv_cache_init(
|
|
1915
1975
|
struct llama_kv_cache & cache,
|
|
1916
1976
|
const llama_model & model,
|
|
1917
|
-
ggml_type
|
|
1918
|
-
ggml_type
|
|
1977
|
+
ggml_type type_k,
|
|
1978
|
+
ggml_type type_v,
|
|
1919
1979
|
uint32_t n_ctx,
|
|
1920
1980
|
bool offload) {
|
|
1921
1981
|
const struct llama_hparams & hparams = model.hparams;
|
|
@@ -1930,6 +1990,9 @@ static bool llama_kv_cache_init(
|
|
|
1930
1990
|
cache.size = n_ctx;
|
|
1931
1991
|
cache.used = 0;
|
|
1932
1992
|
|
|
1993
|
+
cache.type_k = type_k;
|
|
1994
|
+
cache.type_v = type_v;
|
|
1995
|
+
|
|
1933
1996
|
cache.cells.clear();
|
|
1934
1997
|
cache.cells.resize(n_ctx);
|
|
1935
1998
|
|
|
@@ -1970,8 +2033,8 @@ static bool llama_kv_cache_init(
|
|
|
1970
2033
|
|
|
1971
2034
|
for (int i = 0; i < (int) n_layer; i++) {
|
|
1972
2035
|
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
|
1973
|
-
ggml_tensor * k = ggml_new_tensor_1d(ctx,
|
|
1974
|
-
ggml_tensor * v = ggml_new_tensor_1d(ctx,
|
|
2036
|
+
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
|
|
2037
|
+
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
|
|
1975
2038
|
ggml_format_name(k, "cache_k_l%d", i);
|
|
1976
2039
|
ggml_format_name(v, "cache_v_l%d", i);
|
|
1977
2040
|
cache.k_l.push_back(k);
|
|
@@ -2055,7 +2118,7 @@ static bool llama_kv_cache_find_slot(
|
|
|
2055
2118
|
// find how many cells are currently in use
|
|
2056
2119
|
static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
|
|
2057
2120
|
for (uint32_t i = cache.size - 1; i > 0; --i) {
|
|
2058
|
-
if (cache.cells[i].pos >= 0 && !cache.cells[i].
|
|
2121
|
+
if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
|
|
2059
2122
|
return i + 1;
|
|
2060
2123
|
}
|
|
2061
2124
|
}
|
|
@@ -2091,7 +2154,7 @@ static void llama_kv_cache_seq_rm(
|
|
|
2091
2154
|
} else {
|
|
2092
2155
|
continue;
|
|
2093
2156
|
}
|
|
2094
|
-
if (cache.cells[i].
|
|
2157
|
+
if (cache.cells[i].is_empty()) {
|
|
2095
2158
|
// keep count of the number of used cells
|
|
2096
2159
|
if (cache.cells[i].pos >= 0) cache.used--;
|
|
2097
2160
|
|
|
@@ -2142,7 +2205,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
|
|
|
2142
2205
|
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
|
|
2143
2206
|
}
|
|
2144
2207
|
|
|
2145
|
-
static void
|
|
2208
|
+
static void llama_kv_cache_seq_add(
|
|
2146
2209
|
struct llama_kv_cache & cache,
|
|
2147
2210
|
llama_seq_id seq_id,
|
|
2148
2211
|
llama_pos p0,
|
|
@@ -2160,10 +2223,14 @@ static void llama_kv_cache_seq_shift(
|
|
|
2160
2223
|
cache.cells[i].delta += delta;
|
|
2161
2224
|
|
|
2162
2225
|
if (cache.cells[i].pos < 0) {
|
|
2163
|
-
if (!cache.cells[i].
|
|
2226
|
+
if (!cache.cells[i].is_empty()) {
|
|
2227
|
+
cache.used--;
|
|
2228
|
+
}
|
|
2164
2229
|
cache.cells[i].pos = -1;
|
|
2165
2230
|
cache.cells[i].seq_id.clear();
|
|
2166
|
-
if (new_head == cache.size)
|
|
2231
|
+
if (new_head == cache.size) {
|
|
2232
|
+
new_head = i;
|
|
2233
|
+
}
|
|
2167
2234
|
}
|
|
2168
2235
|
}
|
|
2169
2236
|
}
|
|
@@ -2195,6 +2262,22 @@ static void llama_kv_cache_seq_div(
|
|
|
2195
2262
|
}
|
|
2196
2263
|
}
|
|
2197
2264
|
|
|
2265
|
+
static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
|
|
2266
|
+
llama_pos result = 0;
|
|
2267
|
+
|
|
2268
|
+
for (uint32_t i = 0; i < cache.size; ++i) {
|
|
2269
|
+
if (cache.cells[i].has_seq_id(seq_id)) {
|
|
2270
|
+
result = std::max(result, cache.cells[i].pos);
|
|
2271
|
+
}
|
|
2272
|
+
}
|
|
2273
|
+
|
|
2274
|
+
return result;
|
|
2275
|
+
}
|
|
2276
|
+
|
|
2277
|
+
static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
|
|
2278
|
+
cache.do_defrag = true;
|
|
2279
|
+
}
|
|
2280
|
+
|
|
2198
2281
|
//
|
|
2199
2282
|
// model loading and saving
|
|
2200
2283
|
//
|
|
@@ -2266,7 +2349,7 @@ namespace GGUFMeta {
|
|
|
2266
2349
|
}
|
|
2267
2350
|
};
|
|
2268
2351
|
|
|
2269
|
-
struct ArrayInfo{
|
|
2352
|
+
struct ArrayInfo {
|
|
2270
2353
|
const gguf_type gt;
|
|
2271
2354
|
const size_t length;
|
|
2272
2355
|
const void * data;
|
|
@@ -2285,7 +2368,7 @@ namespace GGUFMeta {
|
|
|
2285
2368
|
};
|
|
2286
2369
|
|
|
2287
2370
|
template<typename T>
|
|
2288
|
-
class GKV: public GKV_Base<T> {
|
|
2371
|
+
class GKV : public GKV_Base<T> {
|
|
2289
2372
|
GKV() = delete;
|
|
2290
2373
|
|
|
2291
2374
|
public:
|
|
@@ -2301,46 +2384,46 @@ namespace GGUFMeta {
|
|
|
2301
2384
|
|
|
2302
2385
|
static const char * override_type_to_str(const llama_model_kv_override_type ty) {
|
|
2303
2386
|
switch (ty) {
|
|
2304
|
-
case
|
|
2305
|
-
case
|
|
2306
|
-
case
|
|
2387
|
+
case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
|
|
2388
|
+
case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
|
|
2389
|
+
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
|
|
2307
2390
|
}
|
|
2308
2391
|
return "unknown";
|
|
2309
2392
|
}
|
|
2310
2393
|
|
|
2311
|
-
static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *
|
|
2312
|
-
if (!
|
|
2313
|
-
if (
|
|
2394
|
+
static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
|
|
2395
|
+
if (!ovrd) { return false; }
|
|
2396
|
+
if (ovrd->tag == expected_type) {
|
|
2314
2397
|
LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
|
|
2315
|
-
__func__, override_type_to_str(
|
|
2316
|
-
switch (
|
|
2317
|
-
case
|
|
2318
|
-
LLAMA_LOG_INFO("%s\n",
|
|
2398
|
+
__func__, override_type_to_str(ovrd->tag), ovrd->key);
|
|
2399
|
+
switch (ovrd->tag) {
|
|
2400
|
+
case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
|
|
2401
|
+
LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
|
|
2319
2402
|
} break;
|
|
2320
|
-
case
|
|
2321
|
-
LLAMA_LOG_INFO("%" PRId64 "\n",
|
|
2403
|
+
case LLAMA_KV_OVERRIDE_TYPE_INT: {
|
|
2404
|
+
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
|
|
2322
2405
|
} break;
|
|
2323
|
-
case
|
|
2324
|
-
LLAMA_LOG_INFO("%.6f\n",
|
|
2406
|
+
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
|
|
2407
|
+
LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
|
|
2325
2408
|
} break;
|
|
2326
2409
|
default:
|
|
2327
2410
|
// Shouldn't be possible to end up here, but just in case...
|
|
2328
2411
|
throw std::runtime_error(
|
|
2329
2412
|
format("Unsupported attempt to override %s type for metadata key %s\n",
|
|
2330
|
-
override_type_to_str(
|
|
2413
|
+
override_type_to_str(ovrd->tag), ovrd->key));
|
|
2331
2414
|
}
|
|
2332
2415
|
return true;
|
|
2333
2416
|
}
|
|
2334
2417
|
LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
|
|
2335
|
-
__func__,
|
|
2418
|
+
__func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
|
|
2336
2419
|
return false;
|
|
2337
2420
|
}
|
|
2338
2421
|
|
|
2339
2422
|
template<typename OT>
|
|
2340
2423
|
static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
|
|
2341
|
-
try_override(OT & target, const struct llama_model_kv_override *
|
|
2342
|
-
if (validate_override(
|
|
2343
|
-
target =
|
|
2424
|
+
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
|
2425
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
|
|
2426
|
+
target = ovrd->bool_value;
|
|
2344
2427
|
return true;
|
|
2345
2428
|
}
|
|
2346
2429
|
return false;
|
|
@@ -2348,9 +2431,9 @@ namespace GGUFMeta {
|
|
|
2348
2431
|
|
|
2349
2432
|
template<typename OT>
|
|
2350
2433
|
static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
|
|
2351
|
-
try_override(OT & target, const struct llama_model_kv_override *
|
|
2352
|
-
if (validate_override(
|
|
2353
|
-
target =
|
|
2434
|
+
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
|
2435
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
|
|
2436
|
+
target = ovrd->int_value;
|
|
2354
2437
|
return true;
|
|
2355
2438
|
}
|
|
2356
2439
|
return false;
|
|
@@ -2358,9 +2441,9 @@ namespace GGUFMeta {
|
|
|
2358
2441
|
|
|
2359
2442
|
template<typename OT>
|
|
2360
2443
|
static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
|
|
2361
|
-
try_override(T & target, const struct llama_model_kv_override *
|
|
2362
|
-
if (validate_override(
|
|
2363
|
-
target =
|
|
2444
|
+
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
|
2445
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
|
|
2446
|
+
target = ovrd->float_value;
|
|
2364
2447
|
return true;
|
|
2365
2448
|
}
|
|
2366
2449
|
return false;
|
|
@@ -2368,17 +2451,17 @@ namespace GGUFMeta {
|
|
|
2368
2451
|
|
|
2369
2452
|
template<typename OT>
|
|
2370
2453
|
static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
|
|
2371
|
-
try_override(T & target, const struct llama_model_kv_override *
|
|
2454
|
+
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
|
2372
2455
|
(void)target;
|
|
2373
|
-
(void)
|
|
2374
|
-
if (!
|
|
2456
|
+
(void)ovrd;
|
|
2457
|
+
if (!ovrd) { return false; }
|
|
2375
2458
|
// Currently, we should never end up here so it would be a bug if we do.
|
|
2376
2459
|
throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
|
|
2377
|
-
|
|
2460
|
+
ovrd ? ovrd->key : "NULL"));
|
|
2378
2461
|
}
|
|
2379
2462
|
|
|
2380
|
-
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *
|
|
2381
|
-
if (try_override<T>(target,
|
|
2463
|
+
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
|
2464
|
+
if (try_override<T>(target, ovrd)) {
|
|
2382
2465
|
return true;
|
|
2383
2466
|
}
|
|
2384
2467
|
if (k < 0) { return false; }
|
|
@@ -2386,12 +2469,12 @@ namespace GGUFMeta {
|
|
|
2386
2469
|
return true;
|
|
2387
2470
|
}
|
|
2388
2471
|
|
|
2389
|
-
static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *
|
|
2390
|
-
return set(ctx, gguf_find_key(ctx, key), target,
|
|
2472
|
+
static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
|
2473
|
+
return set(ctx, gguf_find_key(ctx, key), target, ovrd);
|
|
2391
2474
|
}
|
|
2392
2475
|
|
|
2393
|
-
static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *
|
|
2394
|
-
return set(ctx, key.c_str(), target,
|
|
2476
|
+
static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
|
2477
|
+
return set(ctx, key.c_str(), target, ovrd);
|
|
2395
2478
|
}
|
|
2396
2479
|
};
|
|
2397
2480
|
}
|
|
@@ -2498,7 +2581,12 @@ struct llama_model_loader {
|
|
|
2498
2581
|
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
|
2499
2582
|
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
|
2500
2583
|
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
|
2584
|
+
case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
|
|
2501
2585
|
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
|
2586
|
+
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
|
|
2587
|
+
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
|
2588
|
+
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
|
2589
|
+
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
|
2502
2590
|
default:
|
|
2503
2591
|
{
|
|
2504
2592
|
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
|
@@ -2744,13 +2832,7 @@ struct llama_model_loader {
|
|
|
2744
2832
|
|
|
2745
2833
|
std::vector<no_init<uint8_t>> read_buf;
|
|
2746
2834
|
|
|
2747
|
-
for (
|
|
2748
|
-
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
|
2749
|
-
if (!cur) {
|
|
2750
|
-
// some tensors may be allocated in a different context
|
|
2751
|
-
continue;
|
|
2752
|
-
}
|
|
2753
|
-
|
|
2835
|
+
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
2754
2836
|
if (progress_callback) {
|
|
2755
2837
|
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
|
2756
2838
|
return false;
|
|
@@ -2805,6 +2887,15 @@ struct llama_model_loader {
|
|
|
2805
2887
|
}
|
|
2806
2888
|
};
|
|
2807
2889
|
|
|
2890
|
+
template<>
|
|
2891
|
+
bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
|
|
2892
|
+
uint32_t tmp;
|
|
2893
|
+
const bool found = get_key(kid, tmp, required);
|
|
2894
|
+
result = (enum llama_pooling_type) tmp;
|
|
2895
|
+
return found;
|
|
2896
|
+
}
|
|
2897
|
+
|
|
2898
|
+
|
|
2808
2899
|
//
|
|
2809
2900
|
// load LLaMA models
|
|
2810
2901
|
//
|
|
@@ -2846,8 +2937,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
|
2846
2937
|
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
|
2847
2938
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
|
|
2848
2939
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
|
2849
|
-
case
|
|
2940
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
|
|
2941
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
|
|
2942
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
|
|
2850
2943
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
|
2944
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
|
|
2945
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
|
2946
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
|
2947
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
|
2948
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
|
2851
2949
|
|
|
2852
2950
|
default: return "unknown, may not work";
|
|
2853
2951
|
}
|
|
@@ -2855,6 +2953,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
|
2855
2953
|
|
|
2856
2954
|
static const char * llama_model_type_name(e_model type) {
|
|
2857
2955
|
switch (type) {
|
|
2956
|
+
case MODEL_22M: return "22M";
|
|
2957
|
+
case MODEL_33M: return "33M";
|
|
2958
|
+
case MODEL_109M: return "109M";
|
|
2959
|
+
case MODEL_137M: return "137M";
|
|
2960
|
+
case MODEL_0_5B: return "0.5B";
|
|
2858
2961
|
case MODEL_1B: return "1B";
|
|
2859
2962
|
case MODEL_2B: return "2B";
|
|
2860
2963
|
case MODEL_3B: return "3B";
|
|
@@ -2876,16 +2979,16 @@ static const char * llama_model_type_name(e_model type) {
|
|
|
2876
2979
|
default: return "?B";
|
|
2877
2980
|
}
|
|
2878
2981
|
}
|
|
2982
|
+
|
|
2879
2983
|
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
|
2880
2984
|
switch (type) {
|
|
2881
|
-
case LLAMA_VOCAB_TYPE_SPM:
|
|
2882
|
-
case LLAMA_VOCAB_TYPE_BPE:
|
|
2883
|
-
case LLAMA_VOCAB_TYPE_WPM:
|
|
2884
|
-
default:
|
|
2985
|
+
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
|
2986
|
+
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
|
2987
|
+
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
|
2988
|
+
default: return "unknown";
|
|
2885
2989
|
}
|
|
2886
2990
|
}
|
|
2887
2991
|
|
|
2888
|
-
|
|
2889
2992
|
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
|
2890
2993
|
model.arch = ml.get_arch();
|
|
2891
2994
|
if (model.arch == LLM_ARCH_UNKNOWN) {
|
|
@@ -2949,7 +3052,7 @@ static void llm_load_hparams(
|
|
|
2949
3052
|
std::string rope_scaling("linear");
|
|
2950
3053
|
ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
|
|
2951
3054
|
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
|
2952
|
-
GGML_ASSERT(hparams.rope_scaling_type_train !=
|
|
3055
|
+
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
|
|
2953
3056
|
|
|
2954
3057
|
// rope_freq_scale (inverse of the kv) is optional
|
|
2955
3058
|
float ropescale = 0.0f;
|
|
@@ -3024,6 +3127,11 @@ static void llm_load_hparams(
|
|
|
3024
3127
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
3025
3128
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
3026
3129
|
}
|
|
3130
|
+
|
|
3131
|
+
if (model.type == e_model::MODEL_13B) {
|
|
3132
|
+
// TODO: become GGUF KV parameter
|
|
3133
|
+
hparams.f_max_alibi_bias = 8.0f;
|
|
3134
|
+
}
|
|
3027
3135
|
} break;
|
|
3028
3136
|
case LLM_ARCH_STARCODER:
|
|
3029
3137
|
{
|
|
@@ -3051,13 +3159,16 @@ static void llm_load_hparams(
|
|
|
3051
3159
|
case 32: model.type = e_model::MODEL_1B; break;
|
|
3052
3160
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
3053
3161
|
}
|
|
3162
|
+
|
|
3163
|
+
// TODO: become GGUF KV parameter
|
|
3164
|
+
hparams.f_max_alibi_bias = 8.0f;
|
|
3054
3165
|
} break;
|
|
3055
3166
|
case LLM_ARCH_BERT:
|
|
3056
3167
|
{
|
|
3057
|
-
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,
|
|
3058
|
-
ml.get_key(LLM_KV_ATTENTION_CAUSAL,
|
|
3168
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
3169
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
3059
3170
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
3060
|
-
ml.get_key(
|
|
3171
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
3061
3172
|
|
|
3062
3173
|
switch (hparams.n_layer) {
|
|
3063
3174
|
case 3:
|
|
@@ -3073,6 +3184,17 @@ static void llm_load_hparams(
|
|
|
3073
3184
|
model.type = e_model::MODEL_335M; break; // bge-large
|
|
3074
3185
|
}
|
|
3075
3186
|
} break;
|
|
3187
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
3188
|
+
{
|
|
3189
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
3190
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
3191
|
+
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
3192
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
3193
|
+
|
|
3194
|
+
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
|
3195
|
+
model.type = e_model::MODEL_137M;
|
|
3196
|
+
}
|
|
3197
|
+
} break;
|
|
3076
3198
|
case LLM_ARCH_BLOOM:
|
|
3077
3199
|
{
|
|
3078
3200
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -3085,11 +3207,12 @@ static void llm_load_hparams(
|
|
|
3085
3207
|
case 4096: model.type = e_model::MODEL_7B; break;
|
|
3086
3208
|
} break;
|
|
3087
3209
|
}
|
|
3210
|
+
|
|
3211
|
+
// TODO: become GGUF KV parameter
|
|
3212
|
+
hparams.f_max_alibi_bias = 8.0f;
|
|
3088
3213
|
} break;
|
|
3089
3214
|
case LLM_ARCH_MPT:
|
|
3090
3215
|
{
|
|
3091
|
-
hparams.f_clamp_kqv = 0.0f;
|
|
3092
|
-
|
|
3093
3216
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
3094
3217
|
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
|
3095
3218
|
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
|
@@ -3187,10 +3310,26 @@ static void llm_load_hparams(
|
|
|
3187
3310
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
3188
3311
|
}
|
|
3189
3312
|
} break;
|
|
3313
|
+
case LLM_ARCH_GEMMA:
|
|
3314
|
+
{
|
|
3315
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
3316
|
+
|
|
3317
|
+
switch (hparams.n_layer) {
|
|
3318
|
+
case 18: model.type = e_model::MODEL_2B; break;
|
|
3319
|
+
case 28: model.type = e_model::MODEL_7B; break;
|
|
3320
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
|
3321
|
+
}
|
|
3322
|
+
} break;
|
|
3190
3323
|
default: (void)0;
|
|
3191
3324
|
}
|
|
3192
3325
|
|
|
3193
3326
|
model.ftype = ml.ftype;
|
|
3327
|
+
|
|
3328
|
+
if (hparams.f_max_alibi_bias > 0.0f) {
|
|
3329
|
+
hparams.need_kq_pos = true;
|
|
3330
|
+
}
|
|
3331
|
+
|
|
3332
|
+
hparams.rope_type = llama_rope_type(&model);
|
|
3194
3333
|
}
|
|
3195
3334
|
|
|
3196
3335
|
// TODO: This should probably be in llama.h
|
|
@@ -3493,6 +3632,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
|
3493
3632
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
|
3494
3633
|
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
|
3495
3634
|
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
|
3635
|
+
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
|
3636
|
+
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
|
3496
3637
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
|
3497
3638
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
|
3498
3639
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
@@ -3559,7 +3700,7 @@ static bool llm_load_tensors(
|
|
|
3559
3700
|
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
|
3560
3701
|
}
|
|
3561
3702
|
|
|
3562
|
-
if (split_mode ==
|
|
3703
|
+
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
|
3563
3704
|
// calculate the split points
|
|
3564
3705
|
int device_count = llama_get_device_count();
|
|
3565
3706
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
|
@@ -3598,10 +3739,10 @@ static bool llm_load_tensors(
|
|
|
3598
3739
|
}
|
|
3599
3740
|
} else {
|
|
3600
3741
|
ggml_backend_buffer_type_t split_buft;
|
|
3601
|
-
if (split_mode ==
|
|
3742
|
+
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
3602
3743
|
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
|
3603
3744
|
} else {
|
|
3604
|
-
//
|
|
3745
|
+
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
|
|
3605
3746
|
split_buft = llama_default_buffer_type_offload(main_gpu);
|
|
3606
3747
|
}
|
|
3607
3748
|
// assign the repeating layers
|
|
@@ -3634,7 +3775,7 @@ static bool llm_load_tensors(
|
|
|
3634
3775
|
}
|
|
3635
3776
|
|
|
3636
3777
|
// create one context per buffer type
|
|
3637
|
-
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
|
3778
|
+
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
|
3638
3779
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
|
3639
3780
|
for (auto & it : buft_layer_count) {
|
|
3640
3781
|
struct ggml_init_params params = {
|
|
@@ -3772,6 +3913,7 @@ static bool llm_load_tensors(
|
|
|
3772
3913
|
} else {
|
|
3773
3914
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
|
3774
3915
|
ml.n_created--; // artificial tensor
|
|
3916
|
+
ml.size_data += ggml_nbytes(model.output);
|
|
3775
3917
|
}
|
|
3776
3918
|
}
|
|
3777
3919
|
|
|
@@ -3875,10 +4017,14 @@ static bool llm_load_tensors(
|
|
|
3875
4017
|
}
|
|
3876
4018
|
} break;
|
|
3877
4019
|
case LLM_ARCH_BERT:
|
|
4020
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
3878
4021
|
{
|
|
3879
|
-
model.tok_embd
|
|
3880
|
-
model.type_embd
|
|
3881
|
-
model.
|
|
4022
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
4023
|
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
|
4024
|
+
if (model.arch == LLM_ARCH_BERT) {
|
|
4025
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
|
4026
|
+
}
|
|
4027
|
+
|
|
3882
4028
|
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
|
3883
4029
|
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
|
3884
4030
|
|
|
@@ -3888,29 +4034,38 @@ static bool llm_load_tensors(
|
|
|
3888
4034
|
|
|
3889
4035
|
auto & layer = model.layers[i];
|
|
3890
4036
|
|
|
3891
|
-
|
|
3892
|
-
|
|
4037
|
+
if (model.arch == LLM_ARCH_BERT) {
|
|
4038
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
4039
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
|
3893
4040
|
|
|
3894
|
-
|
|
3895
|
-
|
|
4041
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
|
4042
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
|
3896
4043
|
|
|
3897
|
-
|
|
3898
|
-
|
|
4044
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
|
4045
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
|
4046
|
+
} else {
|
|
4047
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
4048
|
+
}
|
|
3899
4049
|
|
|
3900
|
-
layer.
|
|
3901
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
|
4050
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3902
4051
|
|
|
3903
|
-
layer.
|
|
3904
|
-
layer.
|
|
4052
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
|
4053
|
+
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
|
3905
4054
|
|
|
3906
|
-
layer.
|
|
3907
|
-
layer.
|
|
4055
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
4056
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
3908
4057
|
|
|
3909
|
-
|
|
3910
|
-
|
|
4058
|
+
if (model.arch == LLM_ARCH_BERT) {
|
|
4059
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
4060
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
3911
4061
|
|
|
3912
|
-
|
|
3913
|
-
|
|
4062
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
4063
|
+
} else {
|
|
4064
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
4065
|
+
}
|
|
4066
|
+
|
|
4067
|
+
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
|
4068
|
+
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
|
3914
4069
|
}
|
|
3915
4070
|
} break;
|
|
3916
4071
|
case LLM_ARCH_BLOOM:
|
|
@@ -3958,7 +4113,12 @@ static bool llm_load_tensors(
|
|
|
3958
4113
|
// output
|
|
3959
4114
|
{
|
|
3960
4115
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3961
|
-
model.
|
|
4116
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
|
4117
|
+
|
|
4118
|
+
// same as tok_embd, duplicated to allow offloading
|
|
4119
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
4120
|
+
ml.n_created--; // artificial tensor
|
|
4121
|
+
ml.size_data += ggml_nbytes(model.output);
|
|
3962
4122
|
}
|
|
3963
4123
|
|
|
3964
4124
|
for (int i = 0; i < n_layer; ++i) {
|
|
@@ -3967,14 +4127,23 @@ static bool llm_load_tensors(
|
|
|
3967
4127
|
|
|
3968
4128
|
auto & layer = model.layers[i];
|
|
3969
4129
|
|
|
3970
|
-
layer.attn_norm
|
|
4130
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
4131
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
|
|
3971
4132
|
|
|
3972
4133
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
4134
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
|
|
4135
|
+
|
|
3973
4136
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
4137
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
|
|
3974
4138
|
|
|
3975
|
-
layer.ffn_norm
|
|
3976
|
-
layer.
|
|
3977
|
-
|
|
4139
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
4140
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
|
4141
|
+
|
|
4142
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
4143
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
|
|
4144
|
+
|
|
4145
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
4146
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
|
3978
4147
|
|
|
3979
4148
|
// AWQ ScaleActivation layer
|
|
3980
4149
|
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
|
@@ -4287,6 +4456,40 @@ static bool llm_load_tensors(
|
|
|
4287
4456
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
4288
4457
|
}
|
|
4289
4458
|
} break;
|
|
4459
|
+
case LLM_ARCH_GEMMA:
|
|
4460
|
+
{
|
|
4461
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
4462
|
+
|
|
4463
|
+
// output
|
|
4464
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
4465
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
|
4466
|
+
ml.n_created--; // artificial tensor
|
|
4467
|
+
ml.size_data += ggml_nbytes(model.output);
|
|
4468
|
+
|
|
4469
|
+
const int64_t n_ff = hparams.n_ff;
|
|
4470
|
+
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
4471
|
+
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
4472
|
+
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
4473
|
+
|
|
4474
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
4475
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
4476
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
4477
|
+
|
|
4478
|
+
auto & layer = model.layers[i];
|
|
4479
|
+
|
|
4480
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
4481
|
+
|
|
4482
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
|
|
4483
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
|
4484
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
|
4485
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
|
|
4486
|
+
|
|
4487
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
4488
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
4489
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
4490
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
4491
|
+
}
|
|
4492
|
+
} break;
|
|
4290
4493
|
default:
|
|
4291
4494
|
throw std::runtime_error("unknown architecture");
|
|
4292
4495
|
}
|
|
@@ -4452,12 +4655,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
|
4452
4655
|
|
|
4453
4656
|
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
|
|
4454
4657
|
|
|
4455
|
-
enum llm_rope_type {
|
|
4456
|
-
LLM_ROPE,
|
|
4457
|
-
LLM_ROPE_NEOX,
|
|
4458
|
-
LLM_ROPE_GLM,
|
|
4459
|
-
};
|
|
4460
|
-
|
|
4461
4658
|
enum llm_ffn_op_type {
|
|
4462
4659
|
LLM_FFN_SILU,
|
|
4463
4660
|
LLM_FFN_GELU,
|
|
@@ -4503,55 +4700,6 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
|
4503
4700
|
return inpL;
|
|
4504
4701
|
}
|
|
4505
4702
|
|
|
4506
|
-
// Persimmon: n_rot = n_embd_head_k/2
|
|
4507
|
-
// Other: n_rot = n_embd_head_k
|
|
4508
|
-
static void llm_build_k_shift(
|
|
4509
|
-
struct ggml_context * ctx,
|
|
4510
|
-
const llama_hparams & hparams,
|
|
4511
|
-
const llama_cparams & cparams,
|
|
4512
|
-
const llama_kv_cache & kv,
|
|
4513
|
-
struct ggml_cgraph * graph,
|
|
4514
|
-
struct ggml_tensor * K_shift,
|
|
4515
|
-
llm_rope_type type,
|
|
4516
|
-
int64_t n_ctx,
|
|
4517
|
-
float freq_base,
|
|
4518
|
-
float freq_scale,
|
|
4519
|
-
const llm_build_cb & cb) {
|
|
4520
|
-
const int64_t n_layer = hparams.n_layer;
|
|
4521
|
-
const int64_t n_head_kv = hparams.n_head_kv;
|
|
4522
|
-
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
4523
|
-
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
4524
|
-
const int32_t n_rot = hparams.n_rot;
|
|
4525
|
-
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
|
|
4526
|
-
const float ext_factor = cparams.yarn_ext_factor;
|
|
4527
|
-
const float attn_factor = cparams.yarn_attn_factor;
|
|
4528
|
-
const float beta_fast = cparams.yarn_beta_fast;
|
|
4529
|
-
const float beta_slow = cparams.yarn_beta_slow;
|
|
4530
|
-
|
|
4531
|
-
int rope_type = 0;
|
|
4532
|
-
|
|
4533
|
-
switch (type) {
|
|
4534
|
-
case LLM_ROPE: rope_type = 0; break;
|
|
4535
|
-
case LLM_ROPE_NEOX: rope_type = 2; break;
|
|
4536
|
-
case LLM_ROPE_GLM: rope_type = 4; break;
|
|
4537
|
-
}
|
|
4538
|
-
|
|
4539
|
-
for (int il = 0; il < n_layer; ++il) {
|
|
4540
|
-
struct ggml_tensor * tmp =
|
|
4541
|
-
// we rotate only the first n_rot dimensions
|
|
4542
|
-
ggml_rope_custom_inplace(ctx,
|
|
4543
|
-
ggml_view_3d(ctx, kv.k_l[il],
|
|
4544
|
-
n_embd_head_k, n_head_kv, n_ctx,
|
|
4545
|
-
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
|
|
4546
|
-
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
|
|
4547
|
-
0),
|
|
4548
|
-
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
4549
|
-
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
4550
|
-
cb(tmp, "K_shifted", il);
|
|
4551
|
-
ggml_build_forward_expand(graph, tmp);
|
|
4552
|
-
}
|
|
4553
|
-
}
|
|
4554
|
-
|
|
4555
4703
|
static void llm_build_kv_store(
|
|
4556
4704
|
struct ggml_context * ctx,
|
|
4557
4705
|
const llama_hparams & hparams,
|
|
@@ -4720,10 +4868,10 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
4720
4868
|
struct ggml_tensor * wo_b,
|
|
4721
4869
|
struct ggml_tensor * q_cur,
|
|
4722
4870
|
struct ggml_tensor * kq_mask,
|
|
4871
|
+
struct ggml_tensor * kq_pos,
|
|
4723
4872
|
int64_t n_ctx,
|
|
4724
4873
|
int32_t n_tokens,
|
|
4725
4874
|
int32_t n_kv,
|
|
4726
|
-
float max_alibi_bias,
|
|
4727
4875
|
float kq_scale,
|
|
4728
4876
|
const llm_build_cb & cb,
|
|
4729
4877
|
int il) {
|
|
@@ -4753,26 +4901,26 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
4753
4901
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
|
4754
4902
|
}
|
|
4755
4903
|
|
|
4756
|
-
|
|
4757
|
-
|
|
4904
|
+
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
|
|
4905
|
+
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
|
|
4906
|
+
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
|
4907
|
+
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
|
4908
|
+
if (hparams.f_max_alibi_bias > 0.0f) {
|
|
4758
4909
|
kq = ggml_scale(ctx, kq, kq_scale);
|
|
4759
4910
|
cb(kq, "kq_scaled", il);
|
|
4760
4911
|
|
|
4761
|
-
|
|
4762
|
-
|
|
4763
|
-
// TODO: K-shift is likely not working
|
|
4764
|
-
// TODO: change to ggml_add
|
|
4765
|
-
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
|
|
4766
|
-
cb(kq, "kq_scaled_alibi", il);
|
|
4767
|
-
}
|
|
4912
|
+
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
|
|
4913
|
+
cb(kq, "kq_scaled_alibi", il);
|
|
4768
4914
|
|
|
4769
4915
|
kq = ggml_add(ctx, kq, kq_mask);
|
|
4770
4916
|
cb(kq, "kq_masked", il);
|
|
4771
4917
|
|
|
4772
4918
|
kq = ggml_soft_max(ctx, kq);
|
|
4773
4919
|
cb(kq, "kq_soft_max", il);
|
|
4774
|
-
} else
|
|
4775
|
-
|
|
4920
|
+
} else
|
|
4921
|
+
#endif
|
|
4922
|
+
{
|
|
4923
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
|
|
4776
4924
|
cb(kq, "kq_soft_max_ext", il);
|
|
4777
4925
|
}
|
|
4778
4926
|
|
|
@@ -4820,11 +4968,11 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
4820
4968
|
struct ggml_tensor * v_cur,
|
|
4821
4969
|
struct ggml_tensor * q_cur,
|
|
4822
4970
|
struct ggml_tensor * kq_mask,
|
|
4971
|
+
struct ggml_tensor * kq_pos,
|
|
4823
4972
|
int64_t n_ctx,
|
|
4824
4973
|
int32_t n_tokens,
|
|
4825
4974
|
int32_t kv_head,
|
|
4826
4975
|
int32_t n_kv,
|
|
4827
|
-
float max_alibi_bias,
|
|
4828
4976
|
float kq_scale,
|
|
4829
4977
|
const llm_build_cb & cb,
|
|
4830
4978
|
int il) {
|
|
@@ -4838,9 +4986,8 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
4838
4986
|
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
|
|
4839
4987
|
|
|
4840
4988
|
struct ggml_tensor * cur;
|
|
4841
|
-
cur = llm_build_kqv(ctx, model, hparams, kv, graph,
|
|
4842
|
-
|
|
4843
|
-
q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
|
|
4989
|
+
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
|
4990
|
+
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
|
|
4844
4991
|
cb(cur, "kqv_out", il);
|
|
4845
4992
|
|
|
4846
4993
|
return cur;
|
|
@@ -4856,6 +5003,7 @@ struct llm_build_context {
|
|
|
4856
5003
|
|
|
4857
5004
|
const int64_t n_embd;
|
|
4858
5005
|
const int64_t n_layer;
|
|
5006
|
+
const int64_t n_rot;
|
|
4859
5007
|
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
|
|
4860
5008
|
const int64_t n_head;
|
|
4861
5009
|
const int64_t n_head_kv;
|
|
@@ -4880,8 +5028,8 @@ struct llm_build_context {
|
|
|
4880
5028
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
|
4881
5029
|
const int32_t n_orig_ctx;
|
|
4882
5030
|
|
|
4883
|
-
const
|
|
4884
|
-
const
|
|
5031
|
+
const enum llama_pooling_type pooling_type;
|
|
5032
|
+
const enum llama_rope_type rope_type;
|
|
4885
5033
|
|
|
4886
5034
|
const llm_build_cb & cb;
|
|
4887
5035
|
|
|
@@ -4903,6 +5051,7 @@ struct llm_build_context {
|
|
|
4903
5051
|
kv_self (lctx.kv_self),
|
|
4904
5052
|
n_embd (hparams.n_embd),
|
|
4905
5053
|
n_layer (hparams.n_layer),
|
|
5054
|
+
n_rot (hparams.n_rot),
|
|
4906
5055
|
n_ctx (cparams.n_ctx),
|
|
4907
5056
|
n_head (hparams.n_head),
|
|
4908
5057
|
n_head_kv (hparams.n_head_kv),
|
|
@@ -4924,8 +5073,8 @@ struct llm_build_context {
|
|
|
4924
5073
|
n_kv (worst_case ? n_ctx : kv_self.n),
|
|
4925
5074
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
|
4926
5075
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
|
4927
|
-
|
|
4928
|
-
|
|
5076
|
+
pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
|
|
5077
|
+
rope_type (hparams.rope_type),
|
|
4929
5078
|
cb (cb),
|
|
4930
5079
|
buf_compute_meta (lctx.buf_compute_meta) {
|
|
4931
5080
|
// all initializations should be done in init()
|
|
@@ -4948,6 +5097,76 @@ struct llm_build_context {
|
|
|
4948
5097
|
}
|
|
4949
5098
|
}
|
|
4950
5099
|
|
|
5100
|
+
struct ggml_cgraph * build_k_shift() {
|
|
5101
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
5102
|
+
|
|
5103
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
5104
|
+
struct ggml_tensor * tmp =
|
|
5105
|
+
// we rotate only the first n_rot dimensions
|
|
5106
|
+
ggml_rope_custom_inplace(ctx0,
|
|
5107
|
+
ggml_view_3d(ctx0, kv_self.k_l[il],
|
|
5108
|
+
n_embd_head_k, n_head_kv, n_ctx,
|
|
5109
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
|
5110
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
|
5111
|
+
0),
|
|
5112
|
+
lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
5113
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
5114
|
+
cb(tmp, "K_shifted", il);
|
|
5115
|
+
ggml_build_forward_expand(gf, tmp);
|
|
5116
|
+
}
|
|
5117
|
+
|
|
5118
|
+
return gf;
|
|
5119
|
+
}
|
|
5120
|
+
|
|
5121
|
+
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
|
|
5122
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
5123
|
+
|
|
5124
|
+
for (uint32_t i = 0; i < ids.size(); ++i) {
|
|
5125
|
+
const uint32_t id = ids[i];
|
|
5126
|
+
|
|
5127
|
+
if (i == id || id == ids.size()) {
|
|
5128
|
+
continue;
|
|
5129
|
+
}
|
|
5130
|
+
|
|
5131
|
+
uint32_t nm = 1;
|
|
5132
|
+
|
|
5133
|
+
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
|
5134
|
+
nm++;
|
|
5135
|
+
}
|
|
5136
|
+
|
|
5137
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
5138
|
+
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
|
|
5139
|
+
n_embd_k_gqa, nm,
|
|
5140
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
|
5141
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
|
|
5142
|
+
|
|
5143
|
+
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
|
|
5144
|
+
n_embd_k_gqa, nm,
|
|
5145
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
|
5146
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
|
|
5147
|
+
|
|
5148
|
+
ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
|
5149
|
+
nm, n_embd_v_gqa,
|
|
5150
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
|
5151
|
+
ggml_row_size(kv_self.v_l[il]->type, i));
|
|
5152
|
+
|
|
5153
|
+
ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
|
5154
|
+
nm, n_embd_v_gqa,
|
|
5155
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
|
5156
|
+
ggml_row_size(kv_self.v_l[il]->type, id));
|
|
5157
|
+
|
|
5158
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
|
5159
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
|
5160
|
+
}
|
|
5161
|
+
|
|
5162
|
+
i += nm - 1;
|
|
5163
|
+
}
|
|
5164
|
+
|
|
5165
|
+
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
|
5166
|
+
|
|
5167
|
+
return gf;
|
|
5168
|
+
}
|
|
5169
|
+
|
|
4951
5170
|
struct ggml_cgraph * build_llama() {
|
|
4952
5171
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
4953
5172
|
|
|
@@ -4969,11 +5188,6 @@ struct llm_build_context {
|
|
|
4969
5188
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
4970
5189
|
cb(KQ_mask, "KQ_mask", -1);
|
|
4971
5190
|
|
|
4972
|
-
// shift the entire K-cache if needed
|
|
4973
|
-
if (do_rope_shift) {
|
|
4974
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
4975
|
-
}
|
|
4976
|
-
|
|
4977
5191
|
for (int il = 0; il < n_layer; ++il) {
|
|
4978
5192
|
struct ggml_tensor * inpSA = inpL;
|
|
4979
5193
|
|
|
@@ -5008,22 +5222,22 @@ struct llm_build_context {
|
|
|
5008
5222
|
}
|
|
5009
5223
|
|
|
5010
5224
|
Qcur = ggml_rope_custom(
|
|
5011
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
|
|
5012
|
-
|
|
5225
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
5226
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
5013
5227
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
5014
5228
|
);
|
|
5015
5229
|
cb(Qcur, "Qcur", il);
|
|
5016
5230
|
|
|
5017
5231
|
Kcur = ggml_rope_custom(
|
|
5018
5232
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
5019
|
-
|
|
5233
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
5020
5234
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
5021
5235
|
);
|
|
5022
5236
|
cb(Kcur, "Kcur", il);
|
|
5023
5237
|
|
|
5024
5238
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5025
5239
|
model.layers[il].wo, model.layers[il].bo,
|
|
5026
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
5240
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5027
5241
|
cb(cur, "kqv_out", il);
|
|
5028
5242
|
}
|
|
5029
5243
|
|
|
@@ -5153,10 +5367,9 @@ struct llm_build_context {
|
|
|
5153
5367
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
5154
5368
|
cb(KQ_mask, "KQ_mask", -1);
|
|
5155
5369
|
|
|
5156
|
-
//
|
|
5157
|
-
|
|
5158
|
-
|
|
5159
|
-
}
|
|
5370
|
+
// positions of the tokens in the KV cache
|
|
5371
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
|
5372
|
+
cb(KQ_pos, "KQ_pos", -1);
|
|
5160
5373
|
|
|
5161
5374
|
for (int il = 0; il < n_layer; ++il) {
|
|
5162
5375
|
struct ggml_tensor * inpSA = inpL;
|
|
@@ -5181,12 +5394,12 @@ struct llm_build_context {
|
|
|
5181
5394
|
case MODEL_7B:
|
|
5182
5395
|
Qcur = ggml_rope_custom(
|
|
5183
5396
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
5184
|
-
|
|
5397
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
5185
5398
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
5186
5399
|
);
|
|
5187
5400
|
Kcur = ggml_rope_custom(
|
|
5188
5401
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
5189
|
-
|
|
5402
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
5190
5403
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
5191
5404
|
);
|
|
5192
5405
|
break;
|
|
@@ -5201,12 +5414,9 @@ struct llm_build_context {
|
|
|
5201
5414
|
cb(Kcur, "Kcur", il);
|
|
5202
5415
|
|
|
5203
5416
|
|
|
5204
|
-
// apply ALiBi for 13B model
|
|
5205
|
-
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
|
5206
|
-
|
|
5207
5417
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5208
5418
|
model.layers[il].wo, NULL,
|
|
5209
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
5419
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5210
5420
|
cb(cur, "kqv_out", il);
|
|
5211
5421
|
}
|
|
5212
5422
|
|
|
@@ -5274,11 +5484,6 @@ struct llm_build_context {
|
|
|
5274
5484
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
5275
5485
|
cb(KQ_mask, "KQ_mask", -1);
|
|
5276
5486
|
|
|
5277
|
-
// shift the entire K-cache if needed
|
|
5278
|
-
if (do_rope_shift) {
|
|
5279
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
5280
|
-
}
|
|
5281
|
-
|
|
5282
5487
|
for (int il = 0; il < n_layer; ++il) {
|
|
5283
5488
|
struct ggml_tensor * attn_norm;
|
|
5284
5489
|
|
|
@@ -5317,20 +5522,20 @@ struct llm_build_context {
|
|
|
5317
5522
|
|
|
5318
5523
|
// using mode = 2 for neox mode
|
|
5319
5524
|
Qcur = ggml_rope_custom(
|
|
5320
|
-
ctx0, Qcur, inp_pos,
|
|
5525
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
5321
5526
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5322
5527
|
);
|
|
5323
5528
|
cb(Qcur, "Qcur", il);
|
|
5324
5529
|
|
|
5325
5530
|
Kcur = ggml_rope_custom(
|
|
5326
|
-
ctx0, Kcur, inp_pos,
|
|
5531
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
5327
5532
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5328
5533
|
);
|
|
5329
5534
|
cb(Kcur, "Kcur", il);
|
|
5330
5535
|
|
|
5331
5536
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5332
5537
|
model.layers[il].wo, NULL,
|
|
5333
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
5538
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5334
5539
|
cb(cur, "kqv_out", il);
|
|
5335
5540
|
}
|
|
5336
5541
|
|
|
@@ -5429,7 +5634,7 @@ struct llm_build_context {
|
|
|
5429
5634
|
|
|
5430
5635
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5431
5636
|
model.layers[il].wo, model.layers[il].bo,
|
|
5432
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
5637
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5433
5638
|
cb(cur, "kqv_out", il);
|
|
5434
5639
|
}
|
|
5435
5640
|
|
|
@@ -5493,10 +5698,6 @@ struct llm_build_context {
|
|
|
5493
5698
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
5494
5699
|
cb(KQ_mask, "KQ_mask", -1);
|
|
5495
5700
|
|
|
5496
|
-
if (do_rope_shift) {
|
|
5497
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
5498
|
-
}
|
|
5499
|
-
|
|
5500
5701
|
for (int il = 0; il < n_layer; ++il) {
|
|
5501
5702
|
struct ggml_tensor * residual = inpL;
|
|
5502
5703
|
|
|
@@ -5554,7 +5755,7 @@ struct llm_build_context {
|
|
|
5554
5755
|
|
|
5555
5756
|
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
|
5556
5757
|
struct ggml_tensor * qrot = ggml_view_3d(
|
|
5557
|
-
ctx0, tmpq,
|
|
5758
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
|
5558
5759
|
ggml_element_size(tmpq) * n_embd_head,
|
|
5559
5760
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
|
5560
5761
|
0
|
|
@@ -5562,7 +5763,7 @@ struct llm_build_context {
|
|
|
5562
5763
|
cb(qrot, "qrot", il);
|
|
5563
5764
|
|
|
5564
5765
|
struct ggml_tensor * krot = ggml_view_3d(
|
|
5565
|
-
ctx0, tmpk,
|
|
5766
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
|
5566
5767
|
ggml_element_size(tmpk) * n_embd_head,
|
|
5567
5768
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
|
5568
5769
|
0
|
|
@@ -5571,29 +5772,29 @@ struct llm_build_context {
|
|
|
5571
5772
|
|
|
5572
5773
|
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
|
5573
5774
|
struct ggml_tensor * qpass = ggml_view_3d(
|
|
5574
|
-
ctx0, tmpq,
|
|
5775
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
|
5575
5776
|
ggml_element_size(tmpq) * n_embd_head,
|
|
5576
5777
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
|
5577
|
-
ggml_element_size(tmpq) *
|
|
5778
|
+
ggml_element_size(tmpq) * n_rot
|
|
5578
5779
|
);
|
|
5579
5780
|
cb(qpass, "qpass", il);
|
|
5580
5781
|
|
|
5581
5782
|
struct ggml_tensor * kpass = ggml_view_3d(
|
|
5582
|
-
ctx0, tmpk,
|
|
5783
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
|
5583
5784
|
ggml_element_size(tmpk) * n_embd_head,
|
|
5584
5785
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
|
5585
|
-
ggml_element_size(tmpk) *
|
|
5786
|
+
ggml_element_size(tmpk) * n_rot
|
|
5586
5787
|
);
|
|
5587
5788
|
cb(kpass, "kpass", il);
|
|
5588
5789
|
|
|
5589
5790
|
struct ggml_tensor * qrotated = ggml_rope_custom(
|
|
5590
|
-
ctx0, qrot, inp_pos,
|
|
5791
|
+
ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
5591
5792
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5592
5793
|
);
|
|
5593
5794
|
cb(qrotated, "qrotated", il);
|
|
5594
5795
|
|
|
5595
5796
|
struct ggml_tensor * krotated = ggml_rope_custom(
|
|
5596
|
-
ctx0, krot, inp_pos,
|
|
5797
|
+
ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
5597
5798
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5598
5799
|
);
|
|
5599
5800
|
cb(krotated, "krotated", il);
|
|
@@ -5634,7 +5835,7 @@ struct llm_build_context {
|
|
|
5634
5835
|
|
|
5635
5836
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5636
5837
|
model.layers[il].wo, model.layers[il].bo,
|
|
5637
|
-
Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
5838
|
+
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5638
5839
|
cb(cur, "kqv_out", il);
|
|
5639
5840
|
}
|
|
5640
5841
|
|
|
@@ -5696,6 +5897,10 @@ struct llm_build_context {
|
|
|
5696
5897
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
5697
5898
|
cb(KQ_mask, "KQ_mask", -1);
|
|
5698
5899
|
|
|
5900
|
+
// positions of the tokens in the KV cache
|
|
5901
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
|
5902
|
+
cb(KQ_pos, "KQ_pos", -1);
|
|
5903
|
+
|
|
5699
5904
|
for (int il = 0; il < n_layer; ++il) {
|
|
5700
5905
|
struct ggml_tensor * inpSA = inpL;
|
|
5701
5906
|
|
|
@@ -5723,7 +5928,7 @@ struct llm_build_context {
|
|
|
5723
5928
|
|
|
5724
5929
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5725
5930
|
model.layers[il].wo, NULL,
|
|
5726
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
5931
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5727
5932
|
cb(cur, "kqv_out", il);
|
|
5728
5933
|
}
|
|
5729
5934
|
|
|
@@ -5773,6 +5978,7 @@ struct llm_build_context {
|
|
|
5773
5978
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
5774
5979
|
|
|
5775
5980
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5981
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
5776
5982
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5777
5983
|
|
|
5778
5984
|
struct ggml_tensor * cur;
|
|
@@ -5781,7 +5987,8 @@ struct llm_build_context {
|
|
|
5781
5987
|
// get input vectors with right size
|
|
5782
5988
|
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
|
5783
5989
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
|
5784
|
-
struct ggml_tensor *
|
|
5990
|
+
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
|
|
5991
|
+
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
|
|
5785
5992
|
|
|
5786
5993
|
// construct input embeddings (token, type, position)
|
|
5787
5994
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
|
@@ -5789,7 +5996,9 @@ struct llm_build_context {
|
|
|
5789
5996
|
// token types are hardcoded to zero ("Sentence A")
|
|
5790
5997
|
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
|
5791
5998
|
inpL = ggml_add(ctx0, inpL, type_row0);
|
|
5792
|
-
|
|
5999
|
+
if (model.arch == LLM_ARCH_BERT) {
|
|
6000
|
+
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
|
6001
|
+
}
|
|
5793
6002
|
cb(inpL, "inp_embd", -1);
|
|
5794
6003
|
|
|
5795
6004
|
// embed layer norm
|
|
@@ -5805,7 +6014,7 @@ struct llm_build_context {
|
|
|
5805
6014
|
struct ggml_tensor * cur = inpL;
|
|
5806
6015
|
|
|
5807
6016
|
// self-attention
|
|
5808
|
-
{
|
|
6017
|
+
if (model.arch == LLM_ARCH_BERT) {
|
|
5809
6018
|
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
|
5810
6019
|
cb(Qcur, "Qcur", il);
|
|
5811
6020
|
|
|
@@ -5820,7 +6029,38 @@ struct llm_build_context {
|
|
|
5820
6029
|
|
|
5821
6030
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5822
6031
|
model.layers[il].wo, model.layers[il].bo,
|
|
5823
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6032
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6033
|
+
cb(cur, "kqv_out", il);
|
|
6034
|
+
} else {
|
|
6035
|
+
// compute Q and K and RoPE them
|
|
6036
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
|
6037
|
+
cb(cur, "wqkv", il);
|
|
6038
|
+
|
|
6039
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6040
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6041
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6042
|
+
|
|
6043
|
+
cb(Qcur, "Qcur", il);
|
|
6044
|
+
cb(Kcur, "Kcur", il);
|
|
6045
|
+
cb(Vcur, "Vcur", il);
|
|
6046
|
+
|
|
6047
|
+
Qcur = ggml_rope_custom(
|
|
6048
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
6049
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6050
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6051
|
+
);
|
|
6052
|
+
cb(Qcur, "Qcur", il);
|
|
6053
|
+
|
|
6054
|
+
Kcur = ggml_rope_custom(
|
|
6055
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
6056
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6057
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6058
|
+
);
|
|
6059
|
+
cb(Kcur, "Kcur", il);
|
|
6060
|
+
|
|
6061
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6062
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
6063
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5824
6064
|
cb(cur, "kqv_out", il);
|
|
5825
6065
|
}
|
|
5826
6066
|
|
|
@@ -5828,25 +6068,34 @@ struct llm_build_context {
|
|
|
5828
6068
|
cur = ggml_add(ctx0, cur, inpL);
|
|
5829
6069
|
|
|
5830
6070
|
// attention layer norm
|
|
5831
|
-
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
|
|
6071
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
|
|
5832
6072
|
|
|
5833
6073
|
struct ggml_tensor * ffn_inp = cur;
|
|
5834
6074
|
cb(ffn_inp, "ffn_inp", il);
|
|
5835
6075
|
|
|
5836
6076
|
// feed-forward network
|
|
5837
|
-
|
|
5838
|
-
|
|
5839
|
-
|
|
5840
|
-
|
|
5841
|
-
|
|
5842
|
-
|
|
6077
|
+
if (model.arch == LLM_ARCH_BERT) {
|
|
6078
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
6079
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
6080
|
+
NULL, NULL,
|
|
6081
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
6082
|
+
NULL,
|
|
6083
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
6084
|
+
} else {
|
|
6085
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
6086
|
+
model.layers[il].ffn_up, NULL,
|
|
6087
|
+
model.layers[il].ffn_gate, NULL,
|
|
6088
|
+
model.layers[il].ffn_down, NULL,
|
|
6089
|
+
NULL,
|
|
6090
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
6091
|
+
}
|
|
5843
6092
|
cb(cur, "ffn_out", il);
|
|
5844
6093
|
|
|
5845
6094
|
// attentions bypass the intermediate layer
|
|
5846
6095
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
5847
6096
|
|
|
5848
6097
|
// output layer norm
|
|
5849
|
-
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
|
|
6098
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
|
|
5850
6099
|
|
|
5851
6100
|
// input for next layer
|
|
5852
6101
|
inpL = cur;
|
|
@@ -5856,8 +6105,12 @@ struct llm_build_context {
|
|
|
5856
6105
|
cur = inpL;
|
|
5857
6106
|
|
|
5858
6107
|
// pooling layer
|
|
5859
|
-
if (
|
|
5860
|
-
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)),
|
|
6108
|
+
if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
|
6109
|
+
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
|
6110
|
+
} else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
|
6111
|
+
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
|
6112
|
+
} else {
|
|
6113
|
+
GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
|
|
5861
6114
|
}
|
|
5862
6115
|
cb(cur, "result_embd", -1);
|
|
5863
6116
|
|
|
@@ -5883,6 +6136,10 @@ struct llm_build_context {
|
|
|
5883
6136
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
5884
6137
|
cb(KQ_mask, "KQ_mask", -1);
|
|
5885
6138
|
|
|
6139
|
+
// positions of the tokens in the KV cache
|
|
6140
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
|
6141
|
+
cb(KQ_pos, "KQ_pos", -1);
|
|
6142
|
+
|
|
5886
6143
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
|
5887
6144
|
model.tok_norm,
|
|
5888
6145
|
model.tok_norm_b,
|
|
@@ -5916,7 +6173,7 @@ struct llm_build_context {
|
|
|
5916
6173
|
|
|
5917
6174
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
5918
6175
|
model.layers[il].wo, model.layers[il].bo,
|
|
5919
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6176
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
5920
6177
|
cb(cur, "kqv_out", il);
|
|
5921
6178
|
}
|
|
5922
6179
|
|
|
@@ -5976,12 +6233,16 @@ struct llm_build_context {
|
|
|
5976
6233
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
5977
6234
|
cb(KQ_mask, "KQ_mask", -1);
|
|
5978
6235
|
|
|
6236
|
+
// positions of the tokens in the KV cache
|
|
6237
|
+
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
|
6238
|
+
cb(KQ_pos, "KQ_pos", -1);
|
|
6239
|
+
|
|
5979
6240
|
for (int il = 0; il < n_layer; ++il) {
|
|
5980
6241
|
struct ggml_tensor * attn_norm;
|
|
5981
6242
|
|
|
5982
6243
|
attn_norm = llm_build_norm(ctx0, inpL, hparams,
|
|
5983
6244
|
model.layers[il].attn_norm,
|
|
5984
|
-
|
|
6245
|
+
model.layers[il].attn_norm_b,
|
|
5985
6246
|
LLM_NORM, cb, il);
|
|
5986
6247
|
cb(attn_norm, "attn_norm", il);
|
|
5987
6248
|
|
|
@@ -5992,6 +6253,11 @@ struct llm_build_context {
|
|
|
5992
6253
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
|
5993
6254
|
cb(cur, "wqkv", il);
|
|
5994
6255
|
|
|
6256
|
+
if (model.layers[il].bqkv){
|
|
6257
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
6258
|
+
cb(cur, "bqkv", il);
|
|
6259
|
+
}
|
|
6260
|
+
|
|
5995
6261
|
if (hparams.f_clamp_kqv > 0.0f) {
|
|
5996
6262
|
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
5997
6263
|
cb(cur, "wqkv_clamped", il);
|
|
@@ -6008,8 +6274,8 @@ struct llm_build_context {
|
|
|
6008
6274
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6009
6275
|
|
|
6010
6276
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6011
|
-
model.layers[il].wo,
|
|
6012
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6277
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
6278
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6013
6279
|
cb(cur, "kqv_out", il);
|
|
6014
6280
|
}
|
|
6015
6281
|
|
|
@@ -6021,13 +6287,13 @@ struct llm_build_context {
|
|
|
6021
6287
|
{
|
|
6022
6288
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
6023
6289
|
model.layers[il].ffn_norm,
|
|
6024
|
-
|
|
6290
|
+
model.layers[il].ffn_norm_b,
|
|
6025
6291
|
LLM_NORM, cb, il);
|
|
6026
6292
|
cb(cur, "ffn_norm", il);
|
|
6027
6293
|
cur = llm_build_ffn(ctx0, cur,
|
|
6028
|
-
model.layers[il].ffn_up,
|
|
6294
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
6029
6295
|
NULL, NULL,
|
|
6030
|
-
model.layers[il].ffn_down,
|
|
6296
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
6031
6297
|
model.layers[il].ffn_act,
|
|
6032
6298
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
6033
6299
|
cb(cur, "ffn_out", il);
|
|
@@ -6044,7 +6310,7 @@ struct llm_build_context {
|
|
|
6044
6310
|
|
|
6045
6311
|
cur = llm_build_norm(ctx0, cur, hparams,
|
|
6046
6312
|
model.output_norm,
|
|
6047
|
-
|
|
6313
|
+
model.output_norm_b,
|
|
6048
6314
|
LLM_NORM, cb, -1);
|
|
6049
6315
|
cb(cur, "result_norm", -1);
|
|
6050
6316
|
|
|
@@ -6076,11 +6342,6 @@ struct llm_build_context {
|
|
|
6076
6342
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6077
6343
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6078
6344
|
|
|
6079
|
-
// shift the entire K-cache if needed
|
|
6080
|
-
if (do_rope_shift) {
|
|
6081
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
6082
|
-
}
|
|
6083
|
-
|
|
6084
6345
|
for (int il = 0; il < n_layer; ++il) {
|
|
6085
6346
|
struct ggml_tensor * inpSA = inpL;
|
|
6086
6347
|
|
|
@@ -6117,21 +6378,21 @@ struct llm_build_context {
|
|
|
6117
6378
|
|
|
6118
6379
|
Qcur = ggml_rope_custom(
|
|
6119
6380
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
6120
|
-
|
|
6381
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6121
6382
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6122
6383
|
);
|
|
6123
6384
|
cb(Qcur, "Qcur", il);
|
|
6124
6385
|
|
|
6125
6386
|
Kcur = ggml_rope_custom(
|
|
6126
6387
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
6127
|
-
|
|
6388
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6128
6389
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6129
6390
|
);
|
|
6130
6391
|
cb(Kcur, "Kcur", il);
|
|
6131
6392
|
|
|
6132
6393
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6133
6394
|
model.layers[il].wo, NULL,
|
|
6134
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6395
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6135
6396
|
cb(cur, "kqv_out", il);
|
|
6136
6397
|
}
|
|
6137
6398
|
|
|
@@ -6199,11 +6460,6 @@ struct llm_build_context {
|
|
|
6199
6460
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6200
6461
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6201
6462
|
|
|
6202
|
-
// shift the entire K-cache if needed
|
|
6203
|
-
if (do_rope_shift) {
|
|
6204
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
6205
|
-
}
|
|
6206
|
-
|
|
6207
6463
|
for (int il = 0; il < n_layer; ++il) {
|
|
6208
6464
|
struct ggml_tensor * inpSA = inpL;
|
|
6209
6465
|
|
|
@@ -6233,20 +6489,20 @@ struct llm_build_context {
|
|
|
6233
6489
|
|
|
6234
6490
|
// using mode = 2 for neox mode
|
|
6235
6491
|
Qcur = ggml_rope_custom(
|
|
6236
|
-
ctx0, Qcur, inp_pos,
|
|
6492
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
6237
6493
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
6238
6494
|
);
|
|
6239
6495
|
cb(Qcur, "Qcur", il);
|
|
6240
6496
|
|
|
6241
6497
|
Kcur = ggml_rope_custom(
|
|
6242
|
-
ctx0, Kcur, inp_pos,
|
|
6498
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
6243
6499
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
6244
6500
|
);
|
|
6245
6501
|
cb(Kcur, "Kcur", il);
|
|
6246
6502
|
|
|
6247
6503
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6248
6504
|
model.layers[il].wo, NULL,
|
|
6249
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6505
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6250
6506
|
cb(cur, "kqv_out", il);
|
|
6251
6507
|
}
|
|
6252
6508
|
|
|
@@ -6313,11 +6569,6 @@ struct llm_build_context {
|
|
|
6313
6569
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6314
6570
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6315
6571
|
|
|
6316
|
-
// shift the entire K-cache if needed
|
|
6317
|
-
if (do_rope_shift) {
|
|
6318
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
6319
|
-
}
|
|
6320
|
-
|
|
6321
6572
|
for (int il = 0; il < n_layer; ++il) {
|
|
6322
6573
|
struct ggml_tensor * inpSA = inpL;
|
|
6323
6574
|
|
|
@@ -6353,21 +6604,21 @@ struct llm_build_context {
|
|
|
6353
6604
|
|
|
6354
6605
|
Qcur = ggml_rope_custom(
|
|
6355
6606
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
6356
|
-
|
|
6607
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6357
6608
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6358
6609
|
);
|
|
6359
6610
|
cb(Qcur, "Qcur", il);
|
|
6360
6611
|
|
|
6361
6612
|
Kcur = ggml_rope_custom(
|
|
6362
6613
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
6363
|
-
|
|
6614
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6364
6615
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6365
6616
|
);
|
|
6366
6617
|
cb(Kcur, "Kcur", il);
|
|
6367
6618
|
|
|
6368
6619
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6369
6620
|
model.layers[il].wo, model.layers[il].bo,
|
|
6370
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6621
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6371
6622
|
cb(cur, "kqv_out", il);
|
|
6372
6623
|
}
|
|
6373
6624
|
|
|
@@ -6434,11 +6685,6 @@ struct llm_build_context {
|
|
|
6434
6685
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6435
6686
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6436
6687
|
|
|
6437
|
-
// shift the entire K-cache if needed
|
|
6438
|
-
if (do_rope_shift) {
|
|
6439
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
6440
|
-
}
|
|
6441
|
-
|
|
6442
6688
|
for (int il = 0; il < n_layer; ++il) {
|
|
6443
6689
|
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
|
6444
6690
|
model.layers[il].attn_norm,
|
|
@@ -6476,7 +6722,7 @@ struct llm_build_context {
|
|
|
6476
6722
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6477
6723
|
|
|
6478
6724
|
Qcur = ggml_rope_custom(
|
|
6479
|
-
ctx0, Qcur, inp_pos,
|
|
6725
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
6480
6726
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
6481
6727
|
);
|
|
6482
6728
|
cb(Qcur, "Qcur", il);
|
|
@@ -6487,14 +6733,14 @@ struct llm_build_context {
|
|
|
6487
6733
|
cb(Qcur, "Qcur", il);
|
|
6488
6734
|
|
|
6489
6735
|
Kcur = ggml_rope_custom(
|
|
6490
|
-
ctx0, Kcur, inp_pos,
|
|
6736
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
6491
6737
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
6492
6738
|
);
|
|
6493
6739
|
cb(Kcur, "Kcur", il);
|
|
6494
6740
|
|
|
6495
6741
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6496
6742
|
model.layers[il].wo, model.layers[il].bo,
|
|
6497
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6743
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
6498
6744
|
cb(cur, "kqv_out", il);
|
|
6499
6745
|
}
|
|
6500
6746
|
|
|
@@ -6556,11 +6802,6 @@ struct llm_build_context {
|
|
|
6556
6802
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6557
6803
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6558
6804
|
|
|
6559
|
-
// shift the entire K-cache if needed
|
|
6560
|
-
if (do_rope_shift) {
|
|
6561
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
6562
|
-
}
|
|
6563
|
-
|
|
6564
6805
|
for (int il = 0; il < n_layer; ++il) {
|
|
6565
6806
|
|
|
6566
6807
|
// norm
|
|
@@ -6584,20 +6825,20 @@ struct llm_build_context {
|
|
|
6584
6825
|
cb(Vcur, "Vcur", il);
|
|
6585
6826
|
|
|
6586
6827
|
Qcur = ggml_rope_custom(
|
|
6587
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur,
|
|
6588
|
-
n_embd_head,
|
|
6828
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
|
|
6829
|
+
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6589
6830
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
6590
6831
|
cb(Qcur, "Qcur", il);
|
|
6591
6832
|
|
|
6592
6833
|
Kcur = ggml_rope_custom(
|
|
6593
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur,
|
|
6594
|
-
n_embd_head,
|
|
6834
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
|
|
6835
|
+
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6595
6836
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
6596
6837
|
cb(Kcur, "Kcur", il);
|
|
6597
6838
|
|
|
6598
6839
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6599
6840
|
model.layers[il].wo, NULL,
|
|
6600
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6841
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6601
6842
|
cb(cur, "kqv_out", il);
|
|
6602
6843
|
}
|
|
6603
6844
|
struct ggml_tensor * sa_out = cur;
|
|
@@ -6696,7 +6937,7 @@ struct llm_build_context {
|
|
|
6696
6937
|
|
|
6697
6938
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6698
6939
|
model.layers[il].wo, model.layers[il].bo,
|
|
6699
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
6940
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6700
6941
|
cb(cur, "kqv_out", il);
|
|
6701
6942
|
}
|
|
6702
6943
|
|
|
@@ -6761,11 +7002,6 @@ struct llm_build_context {
|
|
|
6761
7002
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6762
7003
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6763
7004
|
|
|
6764
|
-
// shift the entire K-cache if needed
|
|
6765
|
-
if (do_rope_shift) {
|
|
6766
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
6767
|
-
}
|
|
6768
|
-
|
|
6769
7005
|
for (int il = 0; il < n_layer; ++il) {
|
|
6770
7006
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
6771
7007
|
model.layers[il].attn_norm,
|
|
@@ -6791,21 +7027,21 @@ struct llm_build_context {
|
|
|
6791
7027
|
|
|
6792
7028
|
struct ggml_tensor * Qcur = ggml_rope_custom(
|
|
6793
7029
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
|
6794
|
-
|
|
7030
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6795
7031
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6796
7032
|
);
|
|
6797
7033
|
cb(Qcur, "Qcur", il);
|
|
6798
7034
|
|
|
6799
7035
|
struct ggml_tensor * Kcur = ggml_rope_custom(
|
|
6800
7036
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
6801
|
-
|
|
7037
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6802
7038
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6803
7039
|
);
|
|
6804
7040
|
cb(Kcur, "Kcur", il);
|
|
6805
7041
|
|
|
6806
7042
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6807
7043
|
model.layers[il].wo, model.layers[il].bo,
|
|
6808
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
7044
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6809
7045
|
cb(cur, "kqv_out", il);
|
|
6810
7046
|
}
|
|
6811
7047
|
|
|
@@ -6869,11 +7105,6 @@ struct llm_build_context {
|
|
|
6869
7105
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6870
7106
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6871
7107
|
|
|
6872
|
-
// shift the entire K-cache if needed
|
|
6873
|
-
if (do_rope_shift) {
|
|
6874
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
6875
|
-
}
|
|
6876
|
-
|
|
6877
7108
|
for (int il = 0; il < n_layer; ++il) {
|
|
6878
7109
|
struct ggml_tensor * inpSA = inpL;
|
|
6879
7110
|
|
|
@@ -6909,21 +7140,21 @@ struct llm_build_context {
|
|
|
6909
7140
|
|
|
6910
7141
|
Qcur = ggml_rope_custom(
|
|
6911
7142
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
6912
|
-
|
|
7143
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6913
7144
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6914
7145
|
);
|
|
6915
7146
|
cb(Qcur, "Qcur", il);
|
|
6916
7147
|
|
|
6917
7148
|
Kcur = ggml_rope_custom(
|
|
6918
7149
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
6919
|
-
|
|
7150
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
6920
7151
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6921
7152
|
);
|
|
6922
7153
|
cb(Kcur, "Kcur", il);
|
|
6923
7154
|
|
|
6924
7155
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6925
7156
|
model.layers[il].wo, NULL,
|
|
6926
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
7157
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6927
7158
|
cb(cur, "kqv_out", il);
|
|
6928
7159
|
}
|
|
6929
7160
|
|
|
@@ -6988,11 +7219,6 @@ struct llm_build_context {
|
|
|
6988
7219
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
6989
7220
|
cb(KQ_mask, "KQ_mask", -1);
|
|
6990
7221
|
|
|
6991
|
-
// shift the entire K-cache if needed
|
|
6992
|
-
if (do_rope_shift) {
|
|
6993
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
6994
|
-
}
|
|
6995
|
-
|
|
6996
7222
|
for (int il = 0; il < n_layer; ++il) {
|
|
6997
7223
|
struct ggml_tensor * inpSA = inpL;
|
|
6998
7224
|
|
|
@@ -7028,21 +7254,21 @@ struct llm_build_context {
|
|
|
7028
7254
|
|
|
7029
7255
|
Qcur = ggml_rope_custom(
|
|
7030
7256
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
7031
|
-
|
|
7257
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7032
7258
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7033
7259
|
);
|
|
7034
7260
|
cb(Qcur, "Qcur", il);
|
|
7035
7261
|
|
|
7036
7262
|
Kcur = ggml_rope_custom(
|
|
7037
7263
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
7038
|
-
|
|
7264
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7039
7265
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7040
7266
|
);
|
|
7041
7267
|
cb(Kcur, "Kcur", il);
|
|
7042
7268
|
|
|
7043
7269
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7044
7270
|
model.layers[il].wo, model.layers[il].bo,
|
|
7045
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
7271
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7046
7272
|
cb(cur, "kqv_out", il);
|
|
7047
7273
|
}
|
|
7048
7274
|
|
|
@@ -7120,11 +7346,6 @@ struct llm_build_context {
|
|
|
7120
7346
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
7121
7347
|
cb(KQ_mask, "KQ_mask", -1);
|
|
7122
7348
|
|
|
7123
|
-
// shift the entire K-cache if needed
|
|
7124
|
-
if (do_rope_shift) {
|
|
7125
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
7126
|
-
}
|
|
7127
|
-
|
|
7128
7349
|
for (int il = 0; il < n_layer; ++il) {
|
|
7129
7350
|
struct ggml_tensor * inpSA = inpL;
|
|
7130
7351
|
|
|
@@ -7160,21 +7381,21 @@ struct llm_build_context {
|
|
|
7160
7381
|
|
|
7161
7382
|
Qcur = ggml_rope_custom(
|
|
7162
7383
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
7163
|
-
|
|
7384
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7164
7385
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7165
7386
|
);
|
|
7166
7387
|
cb(Qcur, "Qcur", il);
|
|
7167
7388
|
|
|
7168
7389
|
Kcur = ggml_rope_custom(
|
|
7169
7390
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
7170
|
-
|
|
7391
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7171
7392
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7172
7393
|
);
|
|
7173
7394
|
cb(Kcur, "Kcur", il);
|
|
7174
7395
|
|
|
7175
7396
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7176
7397
|
model.layers[il].wo, model.layers[il].bo,
|
|
7177
|
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
7398
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7178
7399
|
cb(cur, "kqv_out", il);
|
|
7179
7400
|
}
|
|
7180
7401
|
|
|
@@ -7233,8 +7454,147 @@ struct llm_build_context {
|
|
|
7233
7454
|
|
|
7234
7455
|
return gf;
|
|
7235
7456
|
}
|
|
7457
|
+
|
|
7458
|
+
struct ggml_cgraph * build_gemma() {
|
|
7459
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
7460
|
+
|
|
7461
|
+
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
7462
|
+
|
|
7463
|
+
struct ggml_tensor * cur;
|
|
7464
|
+
struct ggml_tensor * inpL;
|
|
7465
|
+
|
|
7466
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
|
7467
|
+
cb(inpL, "inp_embd", -1);
|
|
7468
|
+
|
|
7469
|
+
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
|
7470
|
+
cb(inpL, "inp_scaled", -1);
|
|
7471
|
+
|
|
7472
|
+
// inp_pos - contains the positions
|
|
7473
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
|
7474
|
+
cb(inp_pos, "inp_pos", -1);
|
|
7475
|
+
|
|
7476
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
7477
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
7478
|
+
cb(KQ_mask, "KQ_mask", -1);
|
|
7479
|
+
|
|
7480
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7481
|
+
|
|
7482
|
+
// norm
|
|
7483
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
7484
|
+
model.layers[il].attn_norm, NULL,
|
|
7485
|
+
LLM_NORM_RMS, cb, il);
|
|
7486
|
+
cb(cur, "attn_norm", il);
|
|
7487
|
+
|
|
7488
|
+
// self-attention
|
|
7489
|
+
{
|
|
7490
|
+
// compute Q and K and RoPE them
|
|
7491
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
7492
|
+
cb(Qcur, "Qcur", il);
|
|
7493
|
+
|
|
7494
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
|
7495
|
+
cb(Kcur, "Kcur", il);
|
|
7496
|
+
|
|
7497
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
|
7498
|
+
cb(Vcur, "Vcur", il);
|
|
7499
|
+
|
|
7500
|
+
Qcur = ggml_rope_custom(
|
|
7501
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
|
7502
|
+
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7503
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
7504
|
+
cb(Qcur, "Qcur", il);
|
|
7505
|
+
|
|
7506
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
|
7507
|
+
cb(Qcur, "Qcur_scaled", il);
|
|
7508
|
+
|
|
7509
|
+
Kcur = ggml_rope_custom(
|
|
7510
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
|
7511
|
+
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7512
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
7513
|
+
cb(Kcur, "Kcur", il);
|
|
7514
|
+
|
|
7515
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7516
|
+
model.layers[il].wo, NULL,
|
|
7517
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
7518
|
+
cb(cur, "kqv_out", il);
|
|
7519
|
+
}
|
|
7520
|
+
|
|
7521
|
+
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
|
7522
|
+
cb(sa_out, "sa_out", il);
|
|
7523
|
+
|
|
7524
|
+
cur = llm_build_norm(ctx0, sa_out, hparams,
|
|
7525
|
+
model.layers[il].ffn_norm, NULL,
|
|
7526
|
+
LLM_NORM_RMS, cb, il);
|
|
7527
|
+
cb(cur, "ffn_norm", il);
|
|
7528
|
+
|
|
7529
|
+
// feed-forward network
|
|
7530
|
+
{
|
|
7531
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
7532
|
+
model.layers[il].ffn_up, NULL,
|
|
7533
|
+
model.layers[il].ffn_gate, NULL,
|
|
7534
|
+
model.layers[il].ffn_down, NULL,
|
|
7535
|
+
NULL,
|
|
7536
|
+
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
|
7537
|
+
cb(cur, "ffn_out", il);
|
|
7538
|
+
}
|
|
7539
|
+
|
|
7540
|
+
cur = ggml_add(ctx0, cur, sa_out);
|
|
7541
|
+
cb(cur, "l_out", il);
|
|
7542
|
+
|
|
7543
|
+
// input for next layer
|
|
7544
|
+
inpL = cur;
|
|
7545
|
+
}
|
|
7546
|
+
|
|
7547
|
+
cur = inpL;
|
|
7548
|
+
|
|
7549
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
7550
|
+
model.output_norm, NULL,
|
|
7551
|
+
LLM_NORM_RMS, cb, -1);
|
|
7552
|
+
cb(cur, "result_norm", -1);
|
|
7553
|
+
|
|
7554
|
+
// lm_head
|
|
7555
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
7556
|
+
cb(cur, "result_output", -1);
|
|
7557
|
+
|
|
7558
|
+
ggml_build_forward_expand(gf, cur);
|
|
7559
|
+
|
|
7560
|
+
return gf;
|
|
7561
|
+
}
|
|
7236
7562
|
};
|
|
7237
7563
|
|
|
7564
|
+
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
|
7565
|
+
llama_batch dummy;
|
|
7566
|
+
dummy.n_tokens = 0;
|
|
7567
|
+
|
|
7568
|
+
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
|
7569
|
+
|
|
7570
|
+
struct llm_build_context llm(lctx, dummy, cb, false);
|
|
7571
|
+
|
|
7572
|
+
llm.init();
|
|
7573
|
+
|
|
7574
|
+
struct ggml_cgraph * result = llm.build_defrag(ids);
|
|
7575
|
+
|
|
7576
|
+
llm.free();
|
|
7577
|
+
|
|
7578
|
+
return result;
|
|
7579
|
+
}
|
|
7580
|
+
|
|
7581
|
+
static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
|
|
7582
|
+
llama_batch dummy;
|
|
7583
|
+
dummy.n_tokens = 0;
|
|
7584
|
+
|
|
7585
|
+
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
|
7586
|
+
|
|
7587
|
+
struct llm_build_context llm(lctx, dummy, cb, false);
|
|
7588
|
+
|
|
7589
|
+
llm.init();
|
|
7590
|
+
|
|
7591
|
+
struct ggml_cgraph * result = llm.build_k_shift();
|
|
7592
|
+
|
|
7593
|
+
llm.free();
|
|
7594
|
+
|
|
7595
|
+
return result;
|
|
7596
|
+
}
|
|
7597
|
+
|
|
7238
7598
|
static struct ggml_cgraph * llama_build_graph(
|
|
7239
7599
|
llama_context & lctx,
|
|
7240
7600
|
const llama_batch & batch,
|
|
@@ -7289,6 +7649,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
7289
7649
|
result = llm.build_refact();
|
|
7290
7650
|
} break;
|
|
7291
7651
|
case LLM_ARCH_BERT:
|
|
7652
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
7292
7653
|
{
|
|
7293
7654
|
result = llm.build_bert();
|
|
7294
7655
|
} break;
|
|
@@ -7340,6 +7701,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
7340
7701
|
{
|
|
7341
7702
|
result = llm.build_minicpm();
|
|
7342
7703
|
} break;
|
|
7704
|
+
case LLM_ARCH_GEMMA:
|
|
7705
|
+
{
|
|
7706
|
+
result = llm.build_gemma();
|
|
7707
|
+
} break;
|
|
7343
7708
|
default:
|
|
7344
7709
|
GGML_ASSERT(false);
|
|
7345
7710
|
}
|
|
@@ -7349,6 +7714,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
7349
7714
|
return result;
|
|
7350
7715
|
}
|
|
7351
7716
|
|
|
7717
|
+
static void llama_set_k_shift(llama_context & lctx) {
|
|
7718
|
+
const auto & cparams = lctx.cparams;
|
|
7719
|
+
|
|
7720
|
+
const int64_t n_ctx = cparams.n_ctx;
|
|
7721
|
+
|
|
7722
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
|
7723
|
+
|
|
7724
|
+
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
|
7725
|
+
|
|
7726
|
+
for (int i = 0; i < n_ctx; ++i) {
|
|
7727
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
|
7728
|
+
}
|
|
7729
|
+
}
|
|
7730
|
+
|
|
7352
7731
|
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7353
7732
|
//
|
|
7354
7733
|
// set input data
|
|
@@ -7404,42 +7783,90 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
7404
7783
|
}
|
|
7405
7784
|
}
|
|
7406
7785
|
|
|
7407
|
-
{
|
|
7408
|
-
|
|
7409
|
-
|
|
7786
|
+
if (hparams.need_kq_pos) {
|
|
7787
|
+
const int64_t n_kv = kv_self.n;
|
|
7788
|
+
|
|
7789
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
|
7790
|
+
|
|
7791
|
+
float * data = (float *) lctx.inp_KQ_pos->data;
|
|
7410
7792
|
|
|
7411
|
-
for (int i = 0; i <
|
|
7412
|
-
data[i] =
|
|
7793
|
+
for (int i = 0; i < n_kv; ++i) {
|
|
7794
|
+
data[i] = float(lctx.kv_self.cells[i].pos);
|
|
7413
7795
|
}
|
|
7414
7796
|
}
|
|
7415
7797
|
|
|
7416
|
-
if (
|
|
7417
|
-
const int64_t
|
|
7798
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
|
7799
|
+
const int64_t n_tokens = batch.n_tokens;
|
|
7800
|
+
|
|
7801
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
|
7802
|
+
float * data = (float *) lctx.inp_mean->data;
|
|
7803
|
+
|
|
7804
|
+
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
|
7418
7805
|
|
|
7419
|
-
|
|
7806
|
+
std::vector<uint64_t> sum(n_tokens, 0);
|
|
7807
|
+
for (int i = 0; i < n_tokens; ++i) {
|
|
7808
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
|
7809
|
+
sum[seq_id] += 1;
|
|
7810
|
+
}
|
|
7420
7811
|
|
|
7421
|
-
|
|
7812
|
+
std::vector<float> div(n_tokens, 0.0f);
|
|
7813
|
+
for (int i = 0; i < n_tokens; ++i) {
|
|
7814
|
+
const uint64_t s = sum[i];
|
|
7815
|
+
if (s > 0) {
|
|
7816
|
+
div[i] = 1.0f/float(s);
|
|
7817
|
+
}
|
|
7818
|
+
}
|
|
7422
7819
|
|
|
7423
|
-
for (int i = 0; i <
|
|
7424
|
-
|
|
7820
|
+
for (int i = 0; i < n_tokens; ++i) {
|
|
7821
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
|
7822
|
+
data[seq_id*n_tokens + i] = div[seq_id];
|
|
7425
7823
|
}
|
|
7426
7824
|
}
|
|
7427
7825
|
|
|
7428
|
-
if (
|
|
7826
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
|
7429
7827
|
const int64_t n_tokens = batch.n_tokens;
|
|
7430
7828
|
|
|
7431
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.
|
|
7432
|
-
|
|
7433
|
-
|
|
7434
|
-
memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
|
|
7829
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
|
7830
|
+
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
|
7435
7831
|
|
|
7436
7832
|
for (int i = 0; i < n_tokens; ++i) {
|
|
7437
7833
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
|
7438
|
-
|
|
7834
|
+
const llama_pos pos = batch.pos[i];
|
|
7835
|
+
if (pos == 0) {
|
|
7836
|
+
data[seq_id] = i;
|
|
7837
|
+
}
|
|
7439
7838
|
}
|
|
7440
7839
|
}
|
|
7441
7840
|
}
|
|
7442
7841
|
|
|
7842
|
+
static void llama_graph_compute(
|
|
7843
|
+
llama_context & lctx,
|
|
7844
|
+
ggml_cgraph * gf,
|
|
7845
|
+
int n_threads) {
|
|
7846
|
+
#ifdef GGML_USE_MPI
|
|
7847
|
+
const int64_t n_layer = lctx.model.hparams.n_layer;
|
|
7848
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
|
7849
|
+
#endif
|
|
7850
|
+
|
|
7851
|
+
#ifdef GGML_USE_METAL
|
|
7852
|
+
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
|
7853
|
+
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
|
7854
|
+
}
|
|
7855
|
+
#endif
|
|
7856
|
+
|
|
7857
|
+
if (lctx.backend_cpu != nullptr) {
|
|
7858
|
+
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
|
7859
|
+
}
|
|
7860
|
+
|
|
7861
|
+
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
|
7862
|
+
|
|
7863
|
+
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
|
7864
|
+
|
|
7865
|
+
#ifdef GGML_USE_MPI
|
|
7866
|
+
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
|
7867
|
+
#endif
|
|
7868
|
+
}
|
|
7869
|
+
|
|
7443
7870
|
// decode a batch of tokens by evaluating the transformer
|
|
7444
7871
|
//
|
|
7445
7872
|
// - lctx: llama context
|
|
@@ -7466,9 +7893,9 @@ static int llama_decode_internal(
|
|
|
7466
7893
|
const auto n_batch = cparams.n_batch;
|
|
7467
7894
|
|
|
7468
7895
|
GGML_ASSERT(n_tokens <= n_batch);
|
|
7896
|
+
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
|
7469
7897
|
|
|
7470
7898
|
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
|
7471
|
-
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
|
7472
7899
|
|
|
7473
7900
|
const int64_t t_start_us = ggml_time_us();
|
|
7474
7901
|
|
|
@@ -7517,6 +7944,8 @@ static int llama_decode_internal(
|
|
|
7517
7944
|
batch.seq_id = seq_id_arr.data();
|
|
7518
7945
|
}
|
|
7519
7946
|
|
|
7947
|
+
llama_kv_cache_update(&lctx);
|
|
7948
|
+
|
|
7520
7949
|
// if we have enough unused cells before the current head ->
|
|
7521
7950
|
// better to start searching from the beginning of the cache, hoping to fill it
|
|
7522
7951
|
if (kv_self.head > kv_self.used + 2*n_tokens) {
|
|
@@ -7541,8 +7970,9 @@ static int llama_decode_internal(
|
|
|
7541
7970
|
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
|
|
7542
7971
|
|
|
7543
7972
|
// the output is always the last tensor in the graph
|
|
7544
|
-
struct ggml_tensor * res
|
|
7973
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
|
7545
7974
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
|
7975
|
+
|
|
7546
7976
|
if (strcmp(res->name, "result_output") == 0) {
|
|
7547
7977
|
// the embeddings could be the second to last tensor, or the third to last tensor
|
|
7548
7978
|
if (strcmp(embeddings->name, "result_norm") != 0) {
|
|
@@ -7569,40 +7999,12 @@ static int llama_decode_internal(
|
|
|
7569
7999
|
n_threads = std::min(4, n_threads);
|
|
7570
8000
|
}
|
|
7571
8001
|
|
|
7572
|
-
#ifdef GGML_USE_MPI
|
|
7573
|
-
const int64_t n_layer = hparams.n_layer;
|
|
7574
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
|
7575
|
-
#endif
|
|
7576
|
-
|
|
7577
|
-
#ifdef GGML_USE_METAL
|
|
7578
|
-
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
|
7579
|
-
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
|
7580
|
-
}
|
|
7581
|
-
#endif
|
|
7582
|
-
|
|
7583
|
-
if (lctx.backend_cpu != nullptr) {
|
|
7584
|
-
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
|
7585
|
-
}
|
|
7586
|
-
|
|
7587
8002
|
llama_set_inputs(lctx, batch);
|
|
7588
8003
|
|
|
7589
|
-
|
|
7590
|
-
|
|
7591
|
-
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
|
7592
|
-
|
|
7593
|
-
#ifdef GGML_USE_MPI
|
|
7594
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
|
7595
|
-
#endif
|
|
8004
|
+
llama_graph_compute(lctx, gf, n_threads);
|
|
7596
8005
|
|
|
7597
8006
|
// update the kv ring buffer
|
|
7598
8007
|
{
|
|
7599
|
-
if (kv_self.has_shift) {
|
|
7600
|
-
kv_self.has_shift = false;
|
|
7601
|
-
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
|
7602
|
-
kv_self.cells[i].delta = 0;
|
|
7603
|
-
}
|
|
7604
|
-
}
|
|
7605
|
-
|
|
7606
8008
|
kv_self.head += n_tokens;
|
|
7607
8009
|
|
|
7608
8010
|
// Ensure kv cache head points to a valid index.
|
|
@@ -7611,91 +8013,342 @@ static int llama_decode_internal(
|
|
|
7611
8013
|
}
|
|
7612
8014
|
}
|
|
7613
8015
|
|
|
8016
|
+
// decide if we need to defrag the kv cache
|
|
8017
|
+
if (cparams.defrag_thold >= 0.0f) {
|
|
8018
|
+
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
|
|
8019
|
+
|
|
8020
|
+
// queue defragmentation for next llama_kv_cache_update
|
|
8021
|
+
if (fragmentation > cparams.defrag_thold) {
|
|
8022
|
+
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
|
|
8023
|
+
|
|
8024
|
+
llama_kv_cache_defrag(kv_self);
|
|
8025
|
+
}
|
|
8026
|
+
}
|
|
8027
|
+
|
|
7614
8028
|
#ifdef GGML_PERF
|
|
7615
8029
|
// print timing information per ggml operation (for debugging purposes)
|
|
7616
8030
|
// requires GGML_PERF to be defined
|
|
7617
8031
|
ggml_graph_print(gf);
|
|
7618
8032
|
#endif
|
|
7619
8033
|
|
|
7620
|
-
// plot the computation graph in dot format (for debugging purposes)
|
|
7621
|
-
//if (n_past%100 == 0) {
|
|
7622
|
-
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
|
7623
|
-
//}
|
|
8034
|
+
// plot the computation graph in dot format (for debugging purposes)
|
|
8035
|
+
//if (n_past%100 == 0) {
|
|
8036
|
+
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
|
8037
|
+
//}
|
|
8038
|
+
|
|
8039
|
+
// extract logits
|
|
8040
|
+
// TODO: do not compute and extract logits if only embeddings are needed
|
|
8041
|
+
// need to update the graphs to skip "result_output"
|
|
8042
|
+
if (res) {
|
|
8043
|
+
auto & logits_out = lctx.logits;
|
|
8044
|
+
|
|
8045
|
+
#ifndef NDEBUG
|
|
8046
|
+
auto & logits_valid = lctx.logits_valid;
|
|
8047
|
+
logits_valid.clear();
|
|
8048
|
+
logits_valid.resize(n_tokens);
|
|
8049
|
+
|
|
8050
|
+
logits_out.clear();
|
|
8051
|
+
#endif
|
|
8052
|
+
|
|
8053
|
+
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
|
|
8054
|
+
GGML_ASSERT(res_backend != nullptr);
|
|
8055
|
+
if (batch.logits) {
|
|
8056
|
+
logits_out.resize(n_vocab * n_tokens);
|
|
8057
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
8058
|
+
if (batch.logits[i] == 0) {
|
|
8059
|
+
continue;
|
|
8060
|
+
}
|
|
8061
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
|
8062
|
+
#ifndef NDEBUG
|
|
8063
|
+
logits_valid[i] = true;
|
|
8064
|
+
#endif
|
|
8065
|
+
}
|
|
8066
|
+
} else if (lctx.logits_all) {
|
|
8067
|
+
logits_out.resize(n_vocab * n_tokens);
|
|
8068
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
|
8069
|
+
#ifndef NDEBUG
|
|
8070
|
+
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
|
8071
|
+
#endif
|
|
8072
|
+
} else {
|
|
8073
|
+
logits_out.resize(n_vocab);
|
|
8074
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
|
8075
|
+
#ifndef NDEBUG
|
|
8076
|
+
logits_valid[0] = true;
|
|
8077
|
+
#endif
|
|
8078
|
+
}
|
|
8079
|
+
ggml_backend_synchronize(res_backend);
|
|
8080
|
+
}
|
|
8081
|
+
|
|
8082
|
+
// extract embeddings
|
|
8083
|
+
if (!lctx.embedding.empty()) {
|
|
8084
|
+
auto & embedding_out = lctx.embedding;
|
|
8085
|
+
|
|
8086
|
+
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
|
|
8087
|
+
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
|
|
8088
|
+
|
|
8089
|
+
embedding_out.resize(embd_size);
|
|
8090
|
+
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
|
8091
|
+
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
|
|
8092
|
+
ggml_backend_synchronize(embeddings_backend);
|
|
8093
|
+
}
|
|
8094
|
+
|
|
8095
|
+
// measure the performance only for the single-token evals
|
|
8096
|
+
if (n_tokens == 1) {
|
|
8097
|
+
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
|
8098
|
+
lctx.n_eval++;
|
|
8099
|
+
}
|
|
8100
|
+
else if (n_tokens > 1) {
|
|
8101
|
+
lctx.t_p_eval_us += ggml_time_us() - t_start_us;
|
|
8102
|
+
lctx.n_p_eval += n_tokens;
|
|
8103
|
+
}
|
|
8104
|
+
|
|
8105
|
+
// get a more accurate load time, upon first eval
|
|
8106
|
+
// TODO: fix this
|
|
8107
|
+
if (!lctx.has_evaluated_once) {
|
|
8108
|
+
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
|
8109
|
+
lctx.has_evaluated_once = true;
|
|
8110
|
+
}
|
|
8111
|
+
|
|
8112
|
+
return 0;
|
|
8113
|
+
}
|
|
8114
|
+
|
|
8115
|
+
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
|
8116
|
+
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|
8117
|
+
auto & kv_self = lctx.kv_self;
|
|
8118
|
+
|
|
8119
|
+
const auto & hparams = lctx.model.hparams;
|
|
8120
|
+
|
|
8121
|
+
const uint32_t n_layer = hparams.n_layer;
|
|
8122
|
+
|
|
8123
|
+
const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
|
|
8124
|
+
const uint32_t n_used = kv_self.used;
|
|
8125
|
+
|
|
8126
|
+
assert(n_used <= n_kv);
|
|
8127
|
+
|
|
8128
|
+
//const int64_t t_start = ggml_time_us();
|
|
8129
|
+
|
|
8130
|
+
// number of cells moved
|
|
8131
|
+
uint32_t n_moves = 0;
|
|
8132
|
+
|
|
8133
|
+
// determine which KV cells to move where
|
|
8134
|
+
//
|
|
8135
|
+
// cell i moves to ids[i]
|
|
8136
|
+
//
|
|
8137
|
+
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
|
8138
|
+
//
|
|
8139
|
+
std::vector<uint32_t> ids(n_kv, n_kv);
|
|
8140
|
+
|
|
8141
|
+
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
|
8142
|
+
const auto & cell0 = kv_self.cells[i0];
|
|
8143
|
+
|
|
8144
|
+
if (!cell0.is_empty()) {
|
|
8145
|
+
ids[i0] = i0;
|
|
8146
|
+
|
|
8147
|
+
continue;
|
|
8148
|
+
}
|
|
8149
|
+
|
|
8150
|
+
// found a hole - fill it with data from the end of the cache
|
|
8151
|
+
|
|
8152
|
+
uint32_t nh = 1;
|
|
8153
|
+
|
|
8154
|
+
// determine the size of the hole
|
|
8155
|
+
while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
|
|
8156
|
+
nh++;
|
|
8157
|
+
}
|
|
8158
|
+
|
|
8159
|
+
// each move requires 6*n_layer tensors (see build_defrag)
|
|
8160
|
+
// - source view, destination view, copy operation
|
|
8161
|
+
// - x2 for keys and values
|
|
8162
|
+
//
|
|
8163
|
+
if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
|
|
8164
|
+
// the graph is too big, we cannot move more cells
|
|
8165
|
+
break;
|
|
8166
|
+
}
|
|
8167
|
+
|
|
8168
|
+
uint32_t nf = 0;
|
|
8169
|
+
uint32_t is = n_kv - 1;
|
|
8170
|
+
|
|
8171
|
+
// starting from the end, find nh non-empty cells
|
|
8172
|
+
for (; is > i0; --is) {
|
|
8173
|
+
const auto & cell1 = kv_self.cells[is];
|
|
8174
|
+
|
|
8175
|
+
if (cell1.is_empty() || ids[is] != n_kv) {
|
|
8176
|
+
continue;
|
|
8177
|
+
}
|
|
8178
|
+
|
|
8179
|
+
// non-empty cell which is not yet moved
|
|
8180
|
+
nf++;
|
|
8181
|
+
|
|
8182
|
+
if (nf == nh) {
|
|
8183
|
+
break;
|
|
8184
|
+
}
|
|
8185
|
+
}
|
|
8186
|
+
|
|
8187
|
+
// this can only happen if `n_used` is not accurate, which would be a bug
|
|
8188
|
+
GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
|
|
8189
|
+
|
|
8190
|
+
nf = 0;
|
|
8191
|
+
|
|
8192
|
+
uint32_t i1 = is;
|
|
8193
|
+
|
|
8194
|
+
// are we moving a continuous block of memory?
|
|
8195
|
+
bool cont = false;
|
|
8196
|
+
|
|
8197
|
+
// go back and move the nf cells to the hole
|
|
8198
|
+
for (; i1 < n_kv; ++i1) {
|
|
8199
|
+
auto & cell1 = kv_self.cells[i1];
|
|
8200
|
+
|
|
8201
|
+
if (cell1.is_empty() || ids[i1] != n_kv) {
|
|
8202
|
+
cont = false;
|
|
8203
|
+
continue;
|
|
8204
|
+
}
|
|
8205
|
+
|
|
8206
|
+
// this cell goes to (i0 + nf)
|
|
8207
|
+
ids[i1] = i0 + nf;
|
|
8208
|
+
|
|
8209
|
+
// move the cell meta data
|
|
8210
|
+
kv_self.cells[i0 + nf] = cell1;
|
|
8211
|
+
|
|
8212
|
+
// clear the old cell and move the head there
|
|
8213
|
+
cell1 = llama_kv_cell();
|
|
8214
|
+
kv_self.head = n_used;
|
|
8215
|
+
|
|
8216
|
+
if (!cont) {
|
|
8217
|
+
n_moves++;
|
|
8218
|
+
cont = true;
|
|
8219
|
+
}
|
|
8220
|
+
|
|
8221
|
+
nf++;
|
|
8222
|
+
|
|
8223
|
+
if (nf == nh) {
|
|
8224
|
+
break;
|
|
8225
|
+
}
|
|
8226
|
+
}
|
|
8227
|
+
|
|
8228
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
|
8229
|
+
|
|
8230
|
+
i0 += nh - 1;
|
|
8231
|
+
}
|
|
8232
|
+
|
|
8233
|
+
if (n_moves == 0) {
|
|
8234
|
+
return;
|
|
8235
|
+
}
|
|
8236
|
+
|
|
8237
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
|
8238
|
+
|
|
8239
|
+
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
|
|
8240
|
+
|
|
8241
|
+
#if 0
|
|
8242
|
+
// CPU defrag
|
|
8243
|
+
//
|
|
8244
|
+
// TODO: optimizations are possible:
|
|
8245
|
+
// - multiple threads
|
|
8246
|
+
// - avoid copying to the host memory when already there
|
|
8247
|
+
//
|
|
8248
|
+
// likely not worth the effort, as we have ggml_graph based defrag
|
|
8249
|
+
//
|
|
8250
|
+
|
|
8251
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
8252
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
8253
|
+
|
|
8254
|
+
const uint32_t kv_size = kv_self.size;
|
|
8255
|
+
|
|
8256
|
+
std::vector<uint8_t> buf_k;
|
|
8257
|
+
std::vector<uint8_t> buf_v;
|
|
8258
|
+
|
|
8259
|
+
for (uint32_t il = 0; il < n_layer; ++il) {
|
|
8260
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
|
8261
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
|
|
8262
|
+
|
|
8263
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
|
8264
|
+
const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
|
|
8265
|
+
|
|
8266
|
+
buf_k.resize(k_size);
|
|
8267
|
+
buf_v.resize(v_size);
|
|
8268
|
+
|
|
8269
|
+
ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
|
8270
|
+
ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
|
8271
|
+
|
|
8272
|
+
// batch move [i, i+nm) to [id, id+nm)
|
|
8273
|
+
// note: cells can move only to a lower index
|
|
8274
|
+
for (uint32_t i = 0; i < n_kv; ++i) {
|
|
8275
|
+
const uint32_t id = ids[i];
|
|
8276
|
+
|
|
8277
|
+
if (i == id || id == n_kv) {
|
|
8278
|
+
continue;
|
|
8279
|
+
}
|
|
8280
|
+
|
|
8281
|
+
uint32_t nm = 1;
|
|
8282
|
+
|
|
8283
|
+
while (i + nm < n_kv && ids[i + nm] == id + nm) {
|
|
8284
|
+
nm++;
|
|
8285
|
+
}
|
|
8286
|
+
|
|
8287
|
+
// move keys
|
|
8288
|
+
{
|
|
8289
|
+
const int64_t os = i*k_size_row;
|
|
8290
|
+
const int64_t od = id*k_size_row;
|
|
8291
|
+
|
|
8292
|
+
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
|
|
8293
|
+
}
|
|
8294
|
+
|
|
8295
|
+
// move values (note: they are transposed)
|
|
8296
|
+
{
|
|
8297
|
+
const int64_t os = i;
|
|
8298
|
+
const int64_t od = id;
|
|
8299
|
+
|
|
8300
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
|
8301
|
+
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
|
|
8302
|
+
}
|
|
8303
|
+
}
|
|
8304
|
+
|
|
8305
|
+
i += nm - 1;
|
|
8306
|
+
}
|
|
8307
|
+
|
|
8308
|
+
ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
|
8309
|
+
ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
|
8310
|
+
}
|
|
8311
|
+
#else
|
|
8312
|
+
// ggml_graph defrag
|
|
8313
|
+
|
|
8314
|
+
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
|
|
8315
|
+
|
|
8316
|
+
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
|
8317
|
+
#endif
|
|
8318
|
+
|
|
8319
|
+
//const int64_t t_end = ggml_time_us();
|
|
7624
8320
|
|
|
7625
|
-
//
|
|
7626
|
-
|
|
7627
|
-
// need to update the graphs to skip "result_output"
|
|
7628
|
-
if (res) {
|
|
7629
|
-
auto & logits_out = lctx.logits;
|
|
8321
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
|
|
8322
|
+
}
|
|
7630
8323
|
|
|
7631
|
-
|
|
7632
|
-
|
|
7633
|
-
|
|
7634
|
-
|
|
8324
|
+
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
|
8325
|
+
// apply K-shift if needed
|
|
8326
|
+
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
|
8327
|
+
llama_set_k_shift(lctx);
|
|
7635
8328
|
|
|
7636
|
-
|
|
7637
|
-
|
|
8329
|
+
{
|
|
8330
|
+
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
|
|
7638
8331
|
|
|
7639
|
-
|
|
7640
|
-
GGML_ASSERT(res_backend != nullptr);
|
|
7641
|
-
if (batch.logits) {
|
|
7642
|
-
logits_out.resize(n_vocab * n_tokens);
|
|
7643
|
-
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
7644
|
-
if (batch.logits[i] == 0) {
|
|
7645
|
-
continue;
|
|
7646
|
-
}
|
|
7647
|
-
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
|
7648
|
-
#ifndef NDEBUG
|
|
7649
|
-
logits_valid[i] = true;
|
|
7650
|
-
#endif
|
|
7651
|
-
}
|
|
7652
|
-
} else if (lctx.logits_all) {
|
|
7653
|
-
logits_out.resize(n_vocab * n_tokens);
|
|
7654
|
-
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
|
7655
|
-
#ifndef NDEBUG
|
|
7656
|
-
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
|
7657
|
-
#endif
|
|
7658
|
-
} else {
|
|
7659
|
-
logits_out.resize(n_vocab);
|
|
7660
|
-
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
|
7661
|
-
#ifndef NDEBUG
|
|
7662
|
-
logits_valid[0] = true;
|
|
7663
|
-
#endif
|
|
8332
|
+
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
|
7664
8333
|
}
|
|
7665
|
-
ggml_backend_synchronize(res_backend);
|
|
7666
|
-
}
|
|
7667
8334
|
|
|
7668
|
-
|
|
7669
|
-
|
|
7670
|
-
auto & embedding_out = lctx.embedding;
|
|
8335
|
+
{
|
|
8336
|
+
auto & kv_self = lctx.kv_self;
|
|
7671
8337
|
|
|
7672
|
-
|
|
7673
|
-
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
|
|
8338
|
+
kv_self.has_shift = false;
|
|
7674
8339
|
|
|
7675
|
-
|
|
7676
|
-
|
|
7677
|
-
|
|
7678
|
-
|
|
8340
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
|
8341
|
+
kv_self.cells[i].delta = 0;
|
|
8342
|
+
}
|
|
8343
|
+
}
|
|
7679
8344
|
}
|
|
7680
8345
|
|
|
7681
|
-
//
|
|
7682
|
-
if (
|
|
7683
|
-
lctx
|
|
7684
|
-
lctx.n_eval++;
|
|
7685
|
-
}
|
|
7686
|
-
else if (n_tokens > 1) {
|
|
7687
|
-
lctx.t_p_eval_us += ggml_time_us() - t_start_us;
|
|
7688
|
-
lctx.n_p_eval += n_tokens;
|
|
7689
|
-
}
|
|
8346
|
+
// defragment the KV cache if needed
|
|
8347
|
+
if (lctx.kv_self.do_defrag) {
|
|
8348
|
+
llama_kv_cache_defrag_internal(lctx);
|
|
7690
8349
|
|
|
7691
|
-
|
|
7692
|
-
// TODO: fix this
|
|
7693
|
-
if (!lctx.has_evaluated_once) {
|
|
7694
|
-
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
|
7695
|
-
lctx.has_evaluated_once = true;
|
|
8350
|
+
lctx.kv_self.do_defrag = false;
|
|
7696
8351
|
}
|
|
7697
|
-
|
|
7698
|
-
return 0;
|
|
7699
8352
|
}
|
|
7700
8353
|
|
|
7701
8354
|
//
|
|
@@ -8289,37 +8942,46 @@ struct llm_tokenizer_wpm {
|
|
|
8289
8942
|
}
|
|
8290
8943
|
|
|
8291
8944
|
std::vector<std::string> preprocess(const std::string & text) {
|
|
8292
|
-
|
|
8293
|
-
|
|
8945
|
+
// normalalization form D
|
|
8946
|
+
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
|
|
8947
|
+
std::vector<uint32_t> nfd_codepoints;
|
|
8948
|
+
for (uint32_t code : codepoints) {
|
|
8949
|
+
auto it = nfd_map.equal_range(code);
|
|
8950
|
+
if (it.first != it.second) {
|
|
8951
|
+
for (auto jt = it.first; jt != it.second; jt++) {
|
|
8952
|
+
nfd_codepoints.push_back(jt->second);
|
|
8953
|
+
}
|
|
8954
|
+
} else {
|
|
8955
|
+
nfd_codepoints.push_back(code);
|
|
8956
|
+
}
|
|
8957
|
+
}
|
|
8294
8958
|
|
|
8295
|
-
//
|
|
8296
|
-
//
|
|
8297
|
-
std::vector<std::string> words;
|
|
8959
|
+
// strip accents, strip control, uniformize whitespace,
|
|
8960
|
+
// to lowercase, pad chinese characters, pad punctuation
|
|
8298
8961
|
std::string new_str = "";
|
|
8299
|
-
|
|
8300
|
-
|
|
8301
|
-
|
|
8302
|
-
|
|
8303
|
-
|
|
8304
|
-
|
|
8305
|
-
|
|
8306
|
-
|
|
8962
|
+
for (uint32_t code : nfd_codepoints) {
|
|
8963
|
+
int type = codepoint_type(code);
|
|
8964
|
+
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
|
8965
|
+
continue;
|
|
8966
|
+
}
|
|
8967
|
+
code = to_lower(code);
|
|
8968
|
+
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
|
8969
|
+
code = ' ';
|
|
8307
8970
|
}
|
|
8308
|
-
|
|
8971
|
+
std::string s = codepoint_to_utf8(code);
|
|
8972
|
+
if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
|
|
8309
8973
|
new_str += " ";
|
|
8310
|
-
new_str +=
|
|
8974
|
+
new_str += s;
|
|
8311
8975
|
new_str += " ";
|
|
8312
|
-
|
|
8313
|
-
|
|
8314
|
-
else {
|
|
8315
|
-
new_str += ori_str[i];
|
|
8316
|
-
i += 1;
|
|
8976
|
+
} else {
|
|
8977
|
+
new_str += s;
|
|
8317
8978
|
}
|
|
8318
8979
|
}
|
|
8319
8980
|
|
|
8320
8981
|
// split by whitespace
|
|
8321
8982
|
uint64_t l = 0;
|
|
8322
8983
|
uint64_t r = 0;
|
|
8984
|
+
std::vector<std::string> words;
|
|
8323
8985
|
while (r < new_str.size()) {
|
|
8324
8986
|
// if is whitespace
|
|
8325
8987
|
if (isspace(new_str[r])) {
|
|
@@ -8337,47 +8999,21 @@ struct llm_tokenizer_wpm {
|
|
|
8337
8999
|
return words;
|
|
8338
9000
|
}
|
|
8339
9001
|
|
|
8340
|
-
|
|
8341
|
-
|
|
8342
|
-
|
|
8343
|
-
|
|
8344
|
-
|
|
8345
|
-
if (c >= 'A' && c <= 'Z') {
|
|
8346
|
-
text2[i] = c - 'A' + 'a';
|
|
8347
|
-
}
|
|
9002
|
+
uint32_t to_lower(uint32_t code) {
|
|
9003
|
+
static const std::locale locale("en_US.UTF-8");
|
|
9004
|
+
#if defined(_WIN32)
|
|
9005
|
+
if (code > 0xFFFF) {
|
|
9006
|
+
return code;
|
|
8348
9007
|
}
|
|
8349
|
-
|
|
9008
|
+
#endif
|
|
9009
|
+
return std::tolower(wchar_t(code), locale);
|
|
8350
9010
|
}
|
|
8351
9011
|
|
|
8352
|
-
bool
|
|
8353
|
-
|
|
8354
|
-
|
|
8355
|
-
|
|
8356
|
-
|
|
8357
|
-
unsigned char ch = static_cast<unsigned char>(str[i]);
|
|
8358
|
-
if (ch <= 0x7f) {
|
|
8359
|
-
codepoint = ch;
|
|
8360
|
-
num_bytes = 1;
|
|
8361
|
-
} else if ((ch >> 5) == 0x06) {
|
|
8362
|
-
codepoint = ch & 0x1f;
|
|
8363
|
-
num_bytes = 2;
|
|
8364
|
-
} else if ((ch >> 4) == 0x0e) {
|
|
8365
|
-
codepoint = ch & 0x0f;
|
|
8366
|
-
num_bytes = 3;
|
|
8367
|
-
} else if ((ch >> 3) == 0x1e) {
|
|
8368
|
-
codepoint = ch & 0x07;
|
|
8369
|
-
num_bytes = 4;
|
|
8370
|
-
}
|
|
8371
|
-
for (int j = 1; j < num_bytes; ++j) {
|
|
8372
|
-
if (i + j >= len) {
|
|
8373
|
-
return false; // incomplete UTF-8 character
|
|
8374
|
-
}
|
|
8375
|
-
unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
|
|
8376
|
-
if ((next_ch >> 6) != 0x02) {
|
|
8377
|
-
return false; // invalid trailing byte
|
|
8378
|
-
}
|
|
8379
|
-
codepoint = (codepoint << 6) | (next_ch & 0x3f);
|
|
8380
|
-
}
|
|
9012
|
+
bool is_ascii_punct(uint32_t code) {
|
|
9013
|
+
return code < 256 && ispunct(code);
|
|
9014
|
+
}
|
|
9015
|
+
|
|
9016
|
+
bool is_chinese_char(uint32_t codepoint) {
|
|
8381
9017
|
if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
|
|
8382
9018
|
(codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
|
|
8383
9019
|
(codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
|
|
@@ -8393,41 +9029,6 @@ struct llm_tokenizer_wpm {
|
|
|
8393
9029
|
return false;
|
|
8394
9030
|
}
|
|
8395
9031
|
|
|
8396
|
-
std::string strip_accents(const std::string & input_string) {
|
|
8397
|
-
std::string resultString;
|
|
8398
|
-
std::map<std::string, char> accent_map = {
|
|
8399
|
-
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
|
|
8400
|
-
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
|
|
8401
|
-
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
|
|
8402
|
-
{"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
|
|
8403
|
-
{"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
|
|
8404
|
-
{"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
|
|
8405
|
-
{"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
|
|
8406
|
-
{"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
|
|
8407
|
-
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
|
|
8408
|
-
};
|
|
8409
|
-
|
|
8410
|
-
for (size_t i = 0; i < input_string.length();) {
|
|
8411
|
-
int len = utf8_len(input_string[i]);
|
|
8412
|
-
std::string curChar = input_string.substr(i, len);
|
|
8413
|
-
auto iter = accent_map.find(curChar);
|
|
8414
|
-
if (iter != accent_map.end()) {
|
|
8415
|
-
resultString += iter->second;
|
|
8416
|
-
} else {
|
|
8417
|
-
resultString += curChar;
|
|
8418
|
-
}
|
|
8419
|
-
i += len;
|
|
8420
|
-
}
|
|
8421
|
-
|
|
8422
|
-
return resultString;
|
|
8423
|
-
}
|
|
8424
|
-
|
|
8425
|
-
static size_t utf8_len(char src) {
|
|
8426
|
-
const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
|
|
8427
|
-
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
|
8428
|
-
return lookup[highbits];
|
|
8429
|
-
}
|
|
8430
|
-
|
|
8431
9032
|
const llama_vocab & vocab;
|
|
8432
9033
|
};
|
|
8433
9034
|
|
|
@@ -9461,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
|
|
|
9461
10062
|
}
|
|
9462
10063
|
}
|
|
9463
10064
|
|
|
9464
|
-
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
|
9465
|
-
llama_sample_temp(ctx, candidates_p, temp);
|
|
9466
|
-
}
|
|
9467
|
-
|
|
9468
10065
|
void llama_sample_repetition_penalties(
|
|
9469
10066
|
struct llama_context * ctx,
|
|
9470
10067
|
llama_token_data_array * candidates,
|
|
@@ -9591,38 +10188,6 @@ void llama_sample_apply_guidance(
|
|
|
9591
10188
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
9592
10189
|
}
|
|
9593
10190
|
|
|
9594
|
-
void llama_sample_classifier_free_guidance(
|
|
9595
|
-
struct llama_context * ctx,
|
|
9596
|
-
llama_token_data_array * candidates,
|
|
9597
|
-
struct llama_context * guidance_ctx,
|
|
9598
|
-
float scale) {
|
|
9599
|
-
GGML_ASSERT(ctx);
|
|
9600
|
-
int64_t t_start_sample_us;
|
|
9601
|
-
|
|
9602
|
-
t_start_sample_us = ggml_time_us();
|
|
9603
|
-
const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
9604
|
-
|
|
9605
|
-
GGML_ASSERT(n_vocab == candidates->size);
|
|
9606
|
-
GGML_ASSERT(!candidates->sorted);
|
|
9607
|
-
|
|
9608
|
-
std::vector<float> logits_base(n_vocab);
|
|
9609
|
-
for (size_t i = 0; i < n_vocab; ++i) {
|
|
9610
|
-
logits_base[i] = candidates->data[i].logit;
|
|
9611
|
-
}
|
|
9612
|
-
|
|
9613
|
-
float * logits_guidance = llama_get_logits(guidance_ctx);
|
|
9614
|
-
|
|
9615
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
9616
|
-
llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
|
|
9617
|
-
t_start_sample_us = ggml_time_us();
|
|
9618
|
-
|
|
9619
|
-
for (size_t i = 0; i < n_vocab; ++i) {
|
|
9620
|
-
candidates->data[i].logit = logits_base[i];
|
|
9621
|
-
}
|
|
9622
|
-
|
|
9623
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
9624
|
-
}
|
|
9625
|
-
|
|
9626
10191
|
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
|
|
9627
10192
|
GGML_ASSERT(ctx);
|
|
9628
10193
|
|
|
@@ -10145,34 +10710,56 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10145
10710
|
return std::make_pair(i_layer, n_layer);
|
|
10146
10711
|
};
|
|
10147
10712
|
|
|
10148
|
-
|
|
10713
|
+
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
|
10714
|
+
// with the quantization of the output tensor
|
|
10715
|
+
if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
|
|
10716
|
+
(LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
|
|
10149
10717
|
int nx = tensor->ne[0];
|
|
10150
10718
|
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
|
10151
10719
|
new_type = GGML_TYPE_Q8_0;
|
|
10152
10720
|
}
|
|
10153
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS
|
|
10721
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
|
10722
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
|
10154
10723
|
new_type = GGML_TYPE_Q5_K;
|
|
10155
10724
|
}
|
|
10156
10725
|
else if (new_type != GGML_TYPE_Q8_0) {
|
|
10157
10726
|
new_type = GGML_TYPE_Q6_K;
|
|
10158
10727
|
}
|
|
10159
10728
|
} else if (name == "token_embd.weight") {
|
|
10160
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS
|
|
10729
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
|
10730
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
|
10161
10731
|
new_type = GGML_TYPE_Q2_K;
|
|
10162
10732
|
}
|
|
10733
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
|
10734
|
+
new_type = GGML_TYPE_IQ3_S;
|
|
10735
|
+
}
|
|
10163
10736
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
10164
|
-
new_type =
|
|
10737
|
+
new_type = GGML_TYPE_IQ3_S;
|
|
10165
10738
|
}
|
|
10166
|
-
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS
|
|
10739
|
+
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
|
10740
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
|
10167
10741
|
if (name.find("attn_v.weight") != std::string::npos) {
|
|
10168
10742
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
|
10169
|
-
else new_type = GGML_TYPE_Q2_K;
|
|
10743
|
+
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
|
10170
10744
|
++qs.i_attention_wv;
|
|
10171
10745
|
}
|
|
10746
|
+
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
|
|
10747
|
+
new_type = GGML_TYPE_Q4_K;
|
|
10748
|
+
}
|
|
10172
10749
|
else if (name.find("ffn_down") != std::string::npos) {
|
|
10173
|
-
if (qs.i_ffn_down < qs.n_ffn_down/8)
|
|
10750
|
+
if (qs.i_ffn_down < qs.n_ffn_down/8) {
|
|
10751
|
+
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
|
10752
|
+
}
|
|
10174
10753
|
++qs.i_ffn_down;
|
|
10175
10754
|
}
|
|
10755
|
+
else if (name.find("attn_output.weight") != std::string::npos) {
|
|
10756
|
+
if (qs.model.hparams.n_expert == 8) {
|
|
10757
|
+
new_type = GGML_TYPE_Q5_K;
|
|
10758
|
+
} else {
|
|
10759
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
|
|
10760
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
|
10761
|
+
}
|
|
10762
|
+
}
|
|
10176
10763
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
|
10177
10764
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
|
10178
10765
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
|
@@ -10181,12 +10768,27 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10181
10768
|
new_type = GGML_TYPE_Q4_K;
|
|
10182
10769
|
}
|
|
10183
10770
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
10184
|
-
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ?
|
|
10771
|
+
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
|
10772
|
+
}
|
|
10773
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
|
10774
|
+
new_type = GGML_TYPE_Q4_K;
|
|
10775
|
+
}
|
|
10776
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
|
10777
|
+
new_type = GGML_TYPE_Q4_K;
|
|
10778
|
+
}
|
|
10779
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
|
10780
|
+
new_type = GGML_TYPE_Q4_K;
|
|
10781
|
+
}
|
|
10782
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
|
10783
|
+
new_type = GGML_TYPE_Q4_K;
|
|
10185
10784
|
}
|
|
10186
10785
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
|
10187
10786
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
|
10188
10787
|
}
|
|
10189
10788
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
|
10789
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
|
|
10790
|
+
new_type = GGML_TYPE_Q5_K;
|
|
10791
|
+
}
|
|
10190
10792
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
|
10191
10793
|
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
|
10192
10794
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
|
@@ -10210,14 +10812,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10210
10812
|
// TODO: explore better strategies
|
|
10211
10813
|
new_type = GGML_TYPE_Q8_0;
|
|
10212
10814
|
}
|
|
10213
|
-
else if (ftype ==
|
|
10214
|
-
new_type =
|
|
10815
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
|
10816
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
|
10817
|
+
}
|
|
10818
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
10819
|
+
new_type = GGML_TYPE_IQ2_S;
|
|
10820
|
+
}
|
|
10821
|
+
} else if (name.find("attn_q.weight") != std::string::npos) {
|
|
10822
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
|
10823
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
|
10824
|
+
}
|
|
10825
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
|
10826
|
+
new_type = GGML_TYPE_IQ2_S;
|
|
10215
10827
|
}
|
|
10216
10828
|
} else if (name.find("ffn_down") != std::string::npos) {
|
|
10217
10829
|
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
|
10218
10830
|
int i_layer = info.first, n_layer = info.second;
|
|
10219
10831
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
10220
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S
|
|
10832
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
|
|
10221
10833
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
|
10222
10834
|
}
|
|
10223
10835
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
|
|
@@ -10228,6 +10840,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10228
10840
|
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
|
10229
10841
|
: GGML_TYPE_Q3_K;
|
|
10230
10842
|
}
|
|
10843
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
|
|
10844
|
+
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
|
|
10845
|
+
new_type = GGML_TYPE_Q4_K;
|
|
10846
|
+
}
|
|
10231
10847
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
|
10232
10848
|
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
|
10233
10849
|
}
|
|
@@ -10239,6 +10855,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10239
10855
|
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
|
10240
10856
|
}
|
|
10241
10857
|
}
|
|
10858
|
+
else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
|
|
10859
|
+
new_type = GGML_TYPE_Q5_K;
|
|
10860
|
+
}
|
|
10242
10861
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
|
10243
10862
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
|
10244
10863
|
new_type = GGML_TYPE_Q5_K;
|
|
@@ -10254,39 +10873,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10254
10873
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
|
10255
10874
|
if (arch != LLM_ARCH_FALCON) {
|
|
10256
10875
|
if (qs.model.hparams.n_expert == 8) {
|
|
10257
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype ==
|
|
10258
|
-
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
|
10259
|
-
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M
|
|
10876
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
|
10877
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
|
|
10878
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
|
|
10879
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
|
|
10260
10880
|
new_type = GGML_TYPE_Q5_K;
|
|
10261
10881
|
}
|
|
10262
10882
|
} else {
|
|
10263
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K
|
|
10264
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type =
|
|
10265
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
|
10266
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
|
10883
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
|
10884
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
|
|
10885
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
|
|
10886
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
|
|
10887
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
|
|
10267
10888
|
}
|
|
10268
10889
|
} else {
|
|
10269
10890
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
|
10270
10891
|
}
|
|
10271
10892
|
}
|
|
10272
10893
|
else if (name.find("attn_qkv.weight") != std::string::npos) {
|
|
10273
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L
|
|
10894
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
|
10895
|
+
new_type = GGML_TYPE_Q4_K;
|
|
10896
|
+
}
|
|
10274
10897
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
|
10275
10898
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
|
10276
10899
|
}
|
|
10277
10900
|
else if (name.find("ffn_gate") != std::string::npos) {
|
|
10278
10901
|
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
|
10279
10902
|
int i_layer = info.first, n_layer = info.second;
|
|
10280
|
-
if (ftype ==
|
|
10281
|
-
new_type =
|
|
10903
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
|
10904
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
|
10282
10905
|
}
|
|
10283
10906
|
++qs.i_ffn_gate;
|
|
10284
10907
|
}
|
|
10285
10908
|
else if (name.find("ffn_up") != std::string::npos) {
|
|
10286
10909
|
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
|
10287
10910
|
int i_layer = info.first, n_layer = info.second;
|
|
10288
|
-
if (ftype ==
|
|
10289
|
-
new_type =
|
|
10911
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
|
10912
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
|
10290
10913
|
}
|
|
10291
10914
|
++qs.i_ffn_up;
|
|
10292
10915
|
}
|
|
@@ -10304,9 +10927,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10304
10927
|
//}
|
|
10305
10928
|
bool convert_incompatible_tensor = false;
|
|
10306
10929
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
|
10307
|
-
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
|
10308
|
-
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
|
10309
|
-
new_type == GGML_TYPE_IQ3_XXS) {
|
|
10930
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
|
10931
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
|
10932
|
+
new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
|
|
10310
10933
|
int nx = tensor->ne[0];
|
|
10311
10934
|
int ny = tensor->ne[1];
|
|
10312
10935
|
if (nx % QK_K != 0) {
|
|
@@ -10320,12 +10943,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10320
10943
|
switch (new_type) {
|
|
10321
10944
|
case GGML_TYPE_IQ2_XXS:
|
|
10322
10945
|
case GGML_TYPE_IQ2_XS:
|
|
10946
|
+
case GGML_TYPE_IQ2_S:
|
|
10323
10947
|
case GGML_TYPE_IQ3_XXS:
|
|
10324
|
-
case
|
|
10325
|
-
case
|
|
10326
|
-
case
|
|
10327
|
-
case
|
|
10328
|
-
case
|
|
10948
|
+
case GGML_TYPE_IQ3_S:
|
|
10949
|
+
case GGML_TYPE_IQ1_S:
|
|
10950
|
+
case GGML_TYPE_Q2_K:
|
|
10951
|
+
case GGML_TYPE_Q3_K:
|
|
10952
|
+
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
|
|
10953
|
+
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
|
10954
|
+
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
|
|
10955
|
+
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
|
|
10329
10956
|
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
|
|
10330
10957
|
}
|
|
10331
10958
|
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
|
|
@@ -10351,7 +10978,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
10351
10978
|
// K-quants
|
|
10352
10979
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
|
10353
10980
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
|
10354
|
-
case
|
|
10981
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
|
|
10355
10982
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
|
10356
10983
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
|
10357
10984
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
|
@@ -10362,7 +10989,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
10362
10989
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
|
10363
10990
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
|
10364
10991
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
|
10992
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
|
|
10993
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
|
|
10365
10994
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
|
10995
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
|
|
10996
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
|
|
10997
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
|
|
10998
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
|
|
10999
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
|
|
10366
11000
|
|
|
10367
11001
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
|
10368
11002
|
}
|
|
@@ -10492,7 +11126,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
10492
11126
|
quantize &= !params->only_copy;
|
|
10493
11127
|
|
|
10494
11128
|
// do not quantize expert gating tensors
|
|
10495
|
-
|
|
11129
|
+
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
11130
|
+
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
|
10496
11131
|
|
|
10497
11132
|
// do not quantize positional embeddings and token types (BERT)
|
|
10498
11133
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
|
@@ -10536,6 +11171,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
10536
11171
|
}
|
|
10537
11172
|
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
|
10538
11173
|
new_type == GGML_TYPE_IQ2_XS ||
|
|
11174
|
+
new_type == GGML_TYPE_IQ2_S ||
|
|
11175
|
+
new_type == GGML_TYPE_IQ1_S ||
|
|
10539
11176
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
|
10540
11177
|
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
|
10541
11178
|
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
|
@@ -10770,7 +11407,7 @@ static int llama_apply_lora_from_file_internal(
|
|
|
10770
11407
|
{
|
|
10771
11408
|
LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
|
10772
11409
|
__func__, ftype);
|
|
10773
|
-
return
|
|
11410
|
+
return 1;
|
|
10774
11411
|
}
|
|
10775
11412
|
}
|
|
10776
11413
|
|
|
@@ -10956,7 +11593,7 @@ static int llama_apply_lora_from_file_internal(
|
|
|
10956
11593
|
struct llama_model_params llama_model_default_params() {
|
|
10957
11594
|
struct llama_model_params result = {
|
|
10958
11595
|
/*.n_gpu_layers =*/ 0,
|
|
10959
|
-
/*.split_mode =*/
|
|
11596
|
+
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
|
10960
11597
|
/*.main_gpu =*/ 0,
|
|
10961
11598
|
/*.tensor_split =*/ nullptr,
|
|
10962
11599
|
/*.progress_callback =*/ nullptr,
|
|
@@ -10982,7 +11619,7 @@ struct llama_context_params llama_context_default_params() {
|
|
|
10982
11619
|
/*.n_batch =*/ 512,
|
|
10983
11620
|
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
|
10984
11621
|
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
|
10985
|
-
/*.rope_scaling_type =*/
|
|
11622
|
+
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
|
|
10986
11623
|
/*.rope_freq_base =*/ 0.0f,
|
|
10987
11624
|
/*.rope_freq_scale =*/ 0.0f,
|
|
10988
11625
|
/*.yarn_ext_factor =*/ -1.0f,
|
|
@@ -10990,11 +11627,11 @@ struct llama_context_params llama_context_default_params() {
|
|
|
10990
11627
|
/*.yarn_beta_fast =*/ 32.0f,
|
|
10991
11628
|
/*.yarn_beta_slow =*/ 1.0f,
|
|
10992
11629
|
/*.yarn_orig_ctx =*/ 0,
|
|
11630
|
+
/*.defrag_thold =*/ -1.0f,
|
|
10993
11631
|
/*.cb_eval =*/ nullptr,
|
|
10994
11632
|
/*.cb_eval_user_data =*/ nullptr,
|
|
10995
11633
|
/*.type_k =*/ GGML_TYPE_F16,
|
|
10996
11634
|
/*.type_v =*/ GGML_TYPE_F16,
|
|
10997
|
-
/*.mul_mat_q =*/ true,
|
|
10998
11635
|
/*.logits_all =*/ false,
|
|
10999
11636
|
/*.embedding =*/ false,
|
|
11000
11637
|
/*.offload_kqv =*/ true,
|
|
@@ -11050,16 +11687,7 @@ bool llama_supports_gpu_offload(void) {
|
|
|
11050
11687
|
#endif
|
|
11051
11688
|
}
|
|
11052
11689
|
|
|
11053
|
-
|
|
11054
|
-
bool llama_mmap_supported(void) {
|
|
11055
|
-
return llama_supports_mmap();
|
|
11056
|
-
}
|
|
11057
|
-
|
|
11058
|
-
bool llama_mlock_supported(void) {
|
|
11059
|
-
return llama_supports_mlock();
|
|
11060
|
-
}
|
|
11061
|
-
|
|
11062
|
-
void llama_backend_init(bool numa) {
|
|
11690
|
+
void llama_backend_init(void) {
|
|
11063
11691
|
ggml_time_init();
|
|
11064
11692
|
|
|
11065
11693
|
// needed to initialize f16 tables
|
|
@@ -11069,15 +11697,17 @@ void llama_backend_init(bool numa) {
|
|
|
11069
11697
|
ggml_free(ctx);
|
|
11070
11698
|
}
|
|
11071
11699
|
|
|
11072
|
-
if (numa) {
|
|
11073
|
-
ggml_numa_init();
|
|
11074
|
-
}
|
|
11075
|
-
|
|
11076
11700
|
#ifdef GGML_USE_MPI
|
|
11077
11701
|
ggml_mpi_backend_init();
|
|
11078
11702
|
#endif
|
|
11079
11703
|
}
|
|
11080
11704
|
|
|
11705
|
+
void llama_numa_init(enum ggml_numa_strategy numa) {
|
|
11706
|
+
if (numa != GGML_NUMA_STRATEGY_DISABLED) {
|
|
11707
|
+
ggml_numa_init(numa);
|
|
11708
|
+
}
|
|
11709
|
+
}
|
|
11710
|
+
|
|
11081
11711
|
void llama_backend_free(void) {
|
|
11082
11712
|
#ifdef GGML_USE_MPI
|
|
11083
11713
|
ggml_mpi_backend_free();
|
|
@@ -11152,7 +11782,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11152
11782
|
cparams.yarn_attn_factor = params.yarn_attn_factor;
|
|
11153
11783
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
|
11154
11784
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
|
11155
|
-
cparams.
|
|
11785
|
+
cparams.defrag_thold = params.defrag_thold;
|
|
11156
11786
|
cparams.offload_kqv = params.offload_kqv;
|
|
11157
11787
|
cparams.do_pooling = params.do_pooling;
|
|
11158
11788
|
|
|
@@ -11168,16 +11798,16 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11168
11798
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
|
11169
11799
|
|
|
11170
11800
|
auto rope_scaling_type = params.rope_scaling_type;
|
|
11171
|
-
if (rope_scaling_type ==
|
|
11801
|
+
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
|
|
11172
11802
|
rope_scaling_type = hparams.rope_scaling_type_train;
|
|
11173
11803
|
}
|
|
11174
11804
|
|
|
11175
|
-
if (rope_scaling_type ==
|
|
11805
|
+
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
|
|
11176
11806
|
cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
|
|
11177
11807
|
}
|
|
11178
11808
|
|
|
11179
11809
|
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
|
11180
|
-
cparams.yarn_ext_factor = rope_scaling_type ==
|
|
11810
|
+
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
|
11181
11811
|
}
|
|
11182
11812
|
|
|
11183
11813
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
|
@@ -11211,8 +11841,8 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11211
11841
|
}
|
|
11212
11842
|
#elif defined(GGML_USE_CUBLAS)
|
|
11213
11843
|
if (model->n_gpu_layers > 0) {
|
|
11214
|
-
// with split_mode
|
|
11215
|
-
if (model->split_mode ==
|
|
11844
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
|
11845
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
11216
11846
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
|
11217
11847
|
if (backend == nullptr) {
|
|
11218
11848
|
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
|
@@ -11221,7 +11851,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11221
11851
|
}
|
|
11222
11852
|
ctx->backends.push_back(backend);
|
|
11223
11853
|
} else {
|
|
11224
|
-
//
|
|
11854
|
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
|
11225
11855
|
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
|
11226
11856
|
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
|
11227
11857
|
if (backend == nullptr) {
|
|
@@ -11274,8 +11904,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11274
11904
|
}
|
|
11275
11905
|
ctx->backends.push_back(ctx->backend_cpu);
|
|
11276
11906
|
|
|
11277
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
|
|
11278
|
-
cparams.n_ctx, cparams.offload_kqv)) {
|
|
11907
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
|
|
11279
11908
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
|
11280
11909
|
llama_free(ctx);
|
|
11281
11910
|
return nullptr;
|
|
@@ -11309,7 +11938,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11309
11938
|
// graph inputs
|
|
11310
11939
|
{
|
|
11311
11940
|
ggml_init_params init_params = {
|
|
11312
|
-
/* .mem_size */ ggml_tensor_overhead()*
|
|
11941
|
+
/* .mem_size */ ggml_tensor_overhead()*8,
|
|
11313
11942
|
/* .mem_buffer */ nullptr,
|
|
11314
11943
|
/* .no_alloc */ true,
|
|
11315
11944
|
};
|
|
@@ -11319,15 +11948,19 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11319
11948
|
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
|
|
11320
11949
|
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
|
11321
11950
|
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
|
11951
|
+
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
|
|
11322
11952
|
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
|
11323
|
-
ctx->
|
|
11953
|
+
ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
|
11954
|
+
ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
|
11324
11955
|
|
|
11325
11956
|
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
|
11326
11957
|
ggml_set_name(ctx->inp_embd, "inp_embd");
|
|
11327
11958
|
ggml_set_name(ctx->inp_pos, "inp_pos");
|
|
11328
11959
|
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
|
11960
|
+
ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
|
|
11329
11961
|
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
|
11330
|
-
ggml_set_name(ctx->
|
|
11962
|
+
ggml_set_name(ctx->inp_mean, "inp_mean");
|
|
11963
|
+
ggml_set_name(ctx->inp_cls, "inp_cls");
|
|
11331
11964
|
|
|
11332
11965
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
|
11333
11966
|
|
|
@@ -11350,7 +11983,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11350
11983
|
}
|
|
11351
11984
|
|
|
11352
11985
|
// buffer used to store the computation graph and the tensor meta data
|
|
11353
|
-
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES +
|
|
11986
|
+
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
|
11354
11987
|
|
|
11355
11988
|
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
|
11356
11989
|
|
|
@@ -11419,6 +12052,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
|
|
11419
12052
|
return model->vocab.type;
|
|
11420
12053
|
}
|
|
11421
12054
|
|
|
12055
|
+
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
12056
|
+
switch (model->arch) {
|
|
12057
|
+
// these models do not use RoPE
|
|
12058
|
+
case LLM_ARCH_GPT2:
|
|
12059
|
+
case LLM_ARCH_GPTJ:
|
|
12060
|
+
case LLM_ARCH_GPTNEOX:
|
|
12061
|
+
case LLM_ARCH_MPT:
|
|
12062
|
+
case LLM_ARCH_REFACT:
|
|
12063
|
+
case LLM_ARCH_BLOOM:
|
|
12064
|
+
return LLAMA_ROPE_TYPE_NONE;
|
|
12065
|
+
|
|
12066
|
+
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
|
12067
|
+
case LLM_ARCH_LLAMA:
|
|
12068
|
+
case LLM_ARCH_BAICHUAN:
|
|
12069
|
+
case LLM_ARCH_STARCODER:
|
|
12070
|
+
case LLM_ARCH_PLAMO:
|
|
12071
|
+
case LLM_ARCH_CODESHELL:
|
|
12072
|
+
case LLM_ARCH_ORION:
|
|
12073
|
+
case LLM_ARCH_INTERNLM2:
|
|
12074
|
+
case LLM_ARCH_MINICPM:
|
|
12075
|
+
return LLAMA_ROPE_TYPE_NORM;
|
|
12076
|
+
|
|
12077
|
+
// the pairs of head values are offset by n_rot/2
|
|
12078
|
+
case LLM_ARCH_FALCON:
|
|
12079
|
+
case LLM_ARCH_PERSIMMON:
|
|
12080
|
+
case LLM_ARCH_BERT:
|
|
12081
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
12082
|
+
case LLM_ARCH_STABLELM:
|
|
12083
|
+
case LLM_ARCH_QWEN:
|
|
12084
|
+
case LLM_ARCH_QWEN2:
|
|
12085
|
+
case LLM_ARCH_PHI2:
|
|
12086
|
+
case LLM_ARCH_GEMMA:
|
|
12087
|
+
return LLAMA_ROPE_TYPE_NEOX;
|
|
12088
|
+
|
|
12089
|
+
// all model arches should be listed explicitly here
|
|
12090
|
+
case LLM_ARCH_UNKNOWN:
|
|
12091
|
+
GGML_ASSERT(false && "unknown architecture");
|
|
12092
|
+
break;
|
|
12093
|
+
}
|
|
12094
|
+
|
|
12095
|
+
return LLAMA_ROPE_TYPE_NONE;
|
|
12096
|
+
}
|
|
12097
|
+
|
|
11422
12098
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
|
11423
12099
|
return model->vocab.id_to_token.size();
|
|
11424
12100
|
}
|
|
@@ -11521,15 +12197,6 @@ uint32_t llama_model_quantize(
|
|
|
11521
12197
|
}
|
|
11522
12198
|
}
|
|
11523
12199
|
|
|
11524
|
-
int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
|
11525
|
-
try {
|
|
11526
|
-
return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
|
|
11527
|
-
} catch (const std::exception & err) {
|
|
11528
|
-
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
|
11529
|
-
return 1;
|
|
11530
|
-
}
|
|
11531
|
-
}
|
|
11532
|
-
|
|
11533
12200
|
int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
|
11534
12201
|
try {
|
|
11535
12202
|
return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
|
@@ -11661,12 +12328,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
|
|
|
11661
12328
|
llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
|
|
11662
12329
|
}
|
|
11663
12330
|
|
|
11664
|
-
void
|
|
12331
|
+
void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
|
11665
12332
|
if (delta == 0) {
|
|
11666
12333
|
return;
|
|
11667
12334
|
}
|
|
11668
12335
|
|
|
11669
|
-
|
|
12336
|
+
llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
|
|
11670
12337
|
}
|
|
11671
12338
|
|
|
11672
12339
|
void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
|
@@ -11677,6 +12344,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
|
|
|
11677
12344
|
llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
|
|
11678
12345
|
}
|
|
11679
12346
|
|
|
12347
|
+
// Context-level entry point: returns whatever the KV-cache overload reports
// for seq_id on the context's own cache (ctx->kv_self).
llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
    auto & kv = ctx->kv_self;
    return llama_kv_cache_seq_pos_max(kv, seq_id);
}
|
|
12350
|
+
|
|
12351
|
+
// Context-level entry point: forwards to the KV-cache overload using the
// context's own cache (ctx->kv_self).
void llama_kv_cache_defrag(struct llama_context * ctx) {
    auto & kv = ctx->kv_self;
    llama_kv_cache_defrag(kv);
}
|
|
12354
|
+
|
|
12355
|
+
// Context-level entry point: hands the dereferenced context to
// llama_kv_cache_update_internal.
void llama_kv_cache_update(struct llama_context * ctx) {
    struct llama_context & lctx = *ctx;
    llama_kv_cache_update_internal(lctx);
}
|
|
12358
|
+
|
|
12359
|
+
|
|
11680
12360
|
// Returns the *maximum* size of the state
|
|
11681
12361
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
11682
12362
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
|
@@ -11803,10 +12483,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
11803
12483
|
const auto & hparams = ctx->model.hparams;
|
|
11804
12484
|
const auto & cparams = ctx->cparams;
|
|
11805
12485
|
|
|
11806
|
-
const
|
|
11807
|
-
const
|
|
11808
|
-
const
|
|
11809
|
-
const
|
|
12486
|
+
const uint32_t n_layer = hparams.n_layer;
|
|
12487
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
12488
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
12489
|
+
const uint32_t n_ctx = cparams.n_ctx;
|
|
11810
12490
|
|
|
11811
12491
|
const size_t kv_buf_size = kv_self.total_size();
|
|
11812
12492
|
const uint32_t kv_head = kv_self.head;
|
|
@@ -11819,18 +12499,21 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
11819
12499
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
|
11820
12500
|
|
|
11821
12501
|
if (kv_buf_size) {
|
|
11822
|
-
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
|
11823
|
-
|
|
11824
12502
|
std::vector<uint8_t> tmp_buf;
|
|
11825
12503
|
for (int il = 0; il < (int) n_layer; ++il) {
|
|
11826
|
-
|
|
12504
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
|
12505
|
+
|
|
12506
|
+
tmp_buf.resize(k_size);
|
|
11827
12507
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
|
11828
12508
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
|
11829
12509
|
|
|
11830
12510
|
// v is not contiguous, copy row by row
|
|
11831
|
-
|
|
12511
|
+
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
|
12512
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
|
12513
|
+
|
|
12514
|
+
tmp_buf.resize(v_row_size);
|
|
11832
12515
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
|
11833
|
-
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*
|
|
12516
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
|
|
11834
12517
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
|
11835
12518
|
}
|
|
11836
12519
|
}
|
|
@@ -11860,8 +12543,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
|
11860
12543
|
}
|
|
11861
12544
|
|
|
11862
12545
|
// Sets the state reading from the specified source address
|
|
11863
|
-
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
11864
|
-
uint8_t * inp = src;
|
|
12546
|
+
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
12547
|
+
const uint8_t * inp = src;
|
|
11865
12548
|
|
|
11866
12549
|
// set rng
|
|
11867
12550
|
{
|
|
@@ -11870,7 +12553,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
|
11870
12553
|
|
|
11871
12554
|
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
|
11872
12555
|
|
|
11873
|
-
std::string rng_str((char *)inp, rng_size); inp += rng_size;
|
|
12556
|
+
std::string rng_str((const char *)inp, rng_size); inp += rng_size;
|
|
11874
12557
|
|
|
11875
12558
|
std::istringstream rng_ss(rng_str);
|
|
11876
12559
|
rng_ss >> ctx->rng;
|
|
@@ -11914,10 +12597,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
|
11914
12597
|
const auto & hparams = ctx->model.hparams;
|
|
11915
12598
|
const auto & cparams = ctx->cparams;
|
|
11916
12599
|
|
|
11917
|
-
const
|
|
11918
|
-
const
|
|
11919
|
-
const
|
|
11920
|
-
const
|
|
12600
|
+
const uint32_t n_layer = hparams.n_layer;
|
|
12601
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
12602
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
12603
|
+
const uint32_t n_ctx = cparams.n_ctx;
|
|
11921
12604
|
|
|
11922
12605
|
size_t kv_buf_size;
|
|
11923
12606
|
uint32_t kv_head;
|
|
@@ -11932,17 +12615,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
|
11932
12615
|
if (kv_buf_size) {
|
|
11933
12616
|
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
|
|
11934
12617
|
|
|
11935
|
-
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
|
11936
|
-
|
|
11937
12618
|
for (int il = 0; il < (int) n_layer; ++il) {
|
|
11938
|
-
size_t k_size =
|
|
12619
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
|
12620
|
+
|
|
11939
12621
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
|
11940
12622
|
inp += k_size;
|
|
11941
12623
|
|
|
11942
12624
|
// v is not contiguous, copy row by row
|
|
11943
|
-
size_t v_row_size
|
|
12625
|
+
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
|
12626
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
|
12627
|
+
|
|
11944
12628
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
|
11945
|
-
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*
|
|
12629
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
|
11946
12630
|
inp += v_row_size;
|
|
11947
12631
|
}
|
|
11948
12632
|
}
|
|
@@ -12062,38 +12746,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
|
|
12062
12746
|
return true;
|
|
12063
12747
|
}
|
|
12064
12748
|
|
|
12065
|
-
int llama_eval(
|
|
12066
|
-
struct llama_context * ctx,
|
|
12067
|
-
llama_token * tokens,
|
|
12068
|
-
int32_t n_tokens,
|
|
12069
|
-
int32_t n_past) {
|
|
12070
|
-
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
|
12071
|
-
|
|
12072
|
-
const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
|
|
12073
|
-
if (ret < 0) {
|
|
12074
|
-
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
|
12075
|
-
}
|
|
12076
|
-
|
|
12077
|
-
return ret;
|
|
12078
|
-
}
|
|
12079
|
-
|
|
12080
|
-
int llama_eval_embd(
|
|
12081
|
-
struct llama_context * ctx,
|
|
12082
|
-
float * embd,
|
|
12083
|
-
int32_t n_tokens,
|
|
12084
|
-
int32_t n_past) {
|
|
12085
|
-
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
|
12086
|
-
|
|
12087
|
-
llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
|
12088
|
-
|
|
12089
|
-
const int ret = llama_decode_internal(*ctx, batch);
|
|
12090
|
-
if (ret < 0) {
|
|
12091
|
-
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
|
12092
|
-
}
|
|
12093
|
-
|
|
12094
|
-
return ret;
|
|
12095
|
-
}
|
|
12096
|
-
|
|
12097
12749
|
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
|
12098
12750
|
ctx->cparams.n_threads = n_threads;
|
|
12099
12751
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
|
@@ -12332,6 +12984,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
|
12332
12984
|
return 0;
|
|
12333
12985
|
}
|
|
12334
12986
|
|
|
12987
|
+
// Trim whitespace from the beginning and end of a string.
//
// Returns a copy of `str` without leading or trailing characters for which
// isspace() is true. An empty or all-whitespace input yields an empty string.
static std::string trim(const std::string & str) {
    // isspace() requires a value representable as unsigned char (or EOF).
    // Passing a plain char is undefined behaviour for bytes >= 0x80 when char
    // is signed (e.g. UTF-8 message content), so convert explicitly first.
    const auto is_space = [](char c) {
        return isspace(static_cast<unsigned char>(c)) != 0;
    };

    size_t start = 0;
    size_t end   = str.size();
    while (start < end && is_space(str[start])) {
        start += 1;
    }
    while (end > start && is_space(str[end - 1])) {
        end -= 1;
    }
    return str.substr(start, end - start);
}
|
|
12999
|
+
|
|
13000
|
+
// Simple version of "llama_apply_chat_template" that only works with strings
|
|
13001
|
+
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
|
|
13002
|
+
static int32_t llama_chat_apply_template_internal(
|
|
13003
|
+
const std::string & tmpl,
|
|
13004
|
+
const std::vector<const llama_chat_message *> & chat,
|
|
13005
|
+
std::string & dest, bool add_ass) {
|
|
13006
|
+
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
|
13007
|
+
std::stringstream ss;
|
|
13008
|
+
if (tmpl.find("<|im_start|>") != std::string::npos) {
|
|
13009
|
+
// chatml template
|
|
13010
|
+
for (auto message : chat) {
|
|
13011
|
+
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
|
|
13012
|
+
}
|
|
13013
|
+
if (add_ass) {
|
|
13014
|
+
ss << "<|im_start|>assistant\n";
|
|
13015
|
+
}
|
|
13016
|
+
} else if (tmpl.find("[INST]") != std::string::npos) {
|
|
13017
|
+
// llama2 template and its variants
|
|
13018
|
+
// [variant] support system message
|
|
13019
|
+
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
|
|
13020
|
+
// [variant] space before + after response
|
|
13021
|
+
bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
|
|
13022
|
+
// [variant] add BOS inside history
|
|
13023
|
+
bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
|
|
13024
|
+
// [variant] trim spaces from the input message
|
|
13025
|
+
bool strip_message = tmpl.find("content.strip()") != std::string::npos;
|
|
13026
|
+
// construct the prompt
|
|
13027
|
+
bool is_inside_turn = true; // skip BOS at the beginning
|
|
13028
|
+
ss << "[INST] ";
|
|
13029
|
+
for (auto message : chat) {
|
|
13030
|
+
std::string content = strip_message ? trim(message->content) : message->content;
|
|
13031
|
+
std::string role(message->role);
|
|
13032
|
+
if (!is_inside_turn) {
|
|
13033
|
+
is_inside_turn = true;
|
|
13034
|
+
ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
|
|
13035
|
+
}
|
|
13036
|
+
if (role == "system") {
|
|
13037
|
+
if (support_system_message) {
|
|
13038
|
+
ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
|
|
13039
|
+
} else {
|
|
13040
|
+
// if the model does not support system message, we still include it in the first message, but without <<SYS>>
|
|
13041
|
+
ss << content << "\n";
|
|
13042
|
+
}
|
|
13043
|
+
} else if (role == "user") {
|
|
13044
|
+
ss << content << " [/INST]";
|
|
13045
|
+
} else {
|
|
13046
|
+
ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
|
|
13047
|
+
is_inside_turn = false;
|
|
13048
|
+
}
|
|
13049
|
+
}
|
|
13050
|
+
// llama2 templates seem to not care about "add_generation_prompt"
|
|
13051
|
+
} else if (tmpl.find("<|user|>") != std::string::npos) {
|
|
13052
|
+
// zephyr template
|
|
13053
|
+
for (auto message : chat) {
|
|
13054
|
+
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
|
|
13055
|
+
}
|
|
13056
|
+
if (add_ass) {
|
|
13057
|
+
ss << "<|assistant|>\n";
|
|
13058
|
+
}
|
|
13059
|
+
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
|
|
13060
|
+
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
|
13061
|
+
for (auto message : chat) {
|
|
13062
|
+
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
|
13063
|
+
ss << bos << message->role << "\n" << message->content << "</s>\n";
|
|
13064
|
+
}
|
|
13065
|
+
if (add_ass) {
|
|
13066
|
+
ss << "<s>assistant\n";
|
|
13067
|
+
}
|
|
13068
|
+
} else if (tmpl.find("<start_of_turn>") != std::string::npos) {
|
|
13069
|
+
// google/gemma-7b-it
|
|
13070
|
+
std::string system_prompt = "";
|
|
13071
|
+
for (auto message : chat) {
|
|
13072
|
+
std::string role(message->role);
|
|
13073
|
+
if (role == "system") {
|
|
13074
|
+
// there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
|
|
13075
|
+
system_prompt = trim(message->content);
|
|
13076
|
+
continue;
|
|
13077
|
+
}
|
|
13078
|
+
// in gemma, "assistant" is "model"
|
|
13079
|
+
role = role == "assistant" ? "model" : message->role;
|
|
13080
|
+
ss << "<start_of_turn>" << role << "\n";
|
|
13081
|
+
if (!system_prompt.empty() && role != "model") {
|
|
13082
|
+
ss << system_prompt << "\n\n";
|
|
13083
|
+
system_prompt = "";
|
|
13084
|
+
}
|
|
13085
|
+
ss << trim(message->content) << "<end_of_turn>\n";
|
|
13086
|
+
}
|
|
13087
|
+
if (add_ass) {
|
|
13088
|
+
ss << "<start_of_turn>model\n";
|
|
13089
|
+
}
|
|
13090
|
+
} else {
|
|
13091
|
+
// template not supported
|
|
13092
|
+
return -1;
|
|
13093
|
+
}
|
|
13094
|
+
dest = ss.str();
|
|
13095
|
+
return dest.size();
|
|
13096
|
+
}
|
|
13097
|
+
|
|
13098
|
+
// Format a conversation with a chat template and copy the result into `buf`.
//
//   model   - consulted only when `tmpl` is null; must then be non-null
//   tmpl    - template text, or nullptr to use the model's
//             "tokenizer.chat_template" metadata (chatml if that is missing)
//   chat    - array of `n_msg` messages
//   add_ass - passed through to the formatter (opens an assistant turn)
//   buf     - destination; may be nullptr to only query the required size
//   length  - capacity of `buf` in bytes
//
// Returns the size of the complete formatted chat, or a negative value when
// the template is not recognised. At most `length` bytes are written; when the
// return value is >= `length` the output was truncated and, as with strncpy,
// is NOT null-terminated.
LLAMA_API int32_t llama_chat_apply_template(
        const struct llama_model * model,
        const char * tmpl,
        const struct llama_chat_message * chat,
        size_t n_msg,
        bool add_ass,
        char * buf,
        int32_t length) {
    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
    if (tmpl == nullptr) {
        GGML_ASSERT(model != nullptr);
        // load template from model
        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
        std::string template_key = "tokenizer.chat_template";
        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
        if (res < 0) {
            // worst case: there is no information about template, we will use chatml by default
            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
        } else {
            curr_tmpl = std::string(model_template.data(), model_template.size());
        }
    }
    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }
    std::string formatted_chat;
    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    // Copy only into a real, non-empty buffer: a null `buf` would be
    // dereferenced, and a negative `length` would convert to a huge size_t
    // and make strncpy write far past the end of `buf`.
    if (buf != nullptr && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}
|
|
13134
|
+
|
|
12335
13135
|
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
|
12336
13136
|
struct llama_timings result = {
|
|
12337
13137
|
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|