llama_cpp 0.12.5 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -196,6 +196,8 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BERT,
+    LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -206,30 +208,34 @@ enum llm_arch {
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
+    LLM_ARCH_GEMMA,
     LLM_ARCH_UNKNOWN,
 };

 static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama"
-    { LLM_ARCH_FALCON, "falcon"
-    { LLM_ARCH_GPT2, "gpt2"
-    { LLM_ARCH_GPTJ, "gptj"
-    { LLM_ARCH_GPTNEOX, "gptneox"
-    { LLM_ARCH_MPT, "mpt"
-    { LLM_ARCH_BAICHUAN, "baichuan"
-    { LLM_ARCH_STARCODER, "starcoder"
-    { LLM_ARCH_PERSIMMON, "persimmon"
-    { LLM_ARCH_REFACT, "refact"
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_GEMMA, "gemma" },
 };

 enum llm_kv {
@@ -252,6 +258,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
+    LLM_KV_POOLING_TYPE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -261,6 +268,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_CAUSAL,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -273,6 +281,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
     LLM_KV_TOKENIZER_SCORES,
     LLM_KV_TOKENIZER_MERGES,
     LLM_KV_TOKENIZER_BOS_ID,
@@ -307,6 +316,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -316,6 +326,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -328,6 +339,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
     { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
     { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
     { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -355,6 +367,7 @@ struct LLM_KV {
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
+    LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -366,6 +379,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT,
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
+    LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_NORM,
@@ -378,6 +392,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
+    LLM_TENSOR_LAYER_OUT_NORM,
 };

 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -494,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-        { LLM_TENSOR_OUTPUT, "output" },
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
         { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -536,6 +550,38 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_NOMIC_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -715,6 +761,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
         },
     },
+    {
+        LLM_ARCH_GEMMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -748,22 +810,37 @@ struct LLM_TN {
     llm_arch arch;

     std::string operator()(llm_tensor tensor) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor);
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
     }
 };
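The guard added to each `LLM_TN::operator()` above checks the per-architecture name table before calling `.at()`, so a tensor with no registered name for the current architecture yields a `"__missing__"` sentinel instead of throwing. A minimal self-contained sketch of the same lookup pattern (the tiny map below is illustrative, not the real `LLM_TENSOR_NAMES` table):

```cpp
#include <iostream>
#include <map>
#include <string>

enum demo_tensor { DEMO_TOKEN_EMBD, DEMO_POS_EMBD };

// illustrative name table; llama.cpp keeps one of these per architecture
static std::map<demo_tensor, std::string> demo_names = {
    { DEMO_TOKEN_EMBD, "token_embd" },
};

// return a sentinel instead of throwing when the tensor has no registered name
static std::string tensor_name(demo_tensor t) {
    if (demo_names.find(t) == demo_names.end()) {
        return "__missing__";
    }
    return demo_names.at(t);
}

int main() {
    std::cout << tensor_name(DEMO_TOKEN_EMBD) << "\n"; // token_embd
    std::cout << tensor_name(DEMO_POS_EMBD)   << "\n"; // __missing__
}
```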
@@ -974,7 +1051,7 @@ struct llama_mmap {
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
         // prefetch/readahead impairs performance on NUMA systems
-        if (numa)
+        if (numa) { prefetch = 0; }
 #ifdef __linux__
         // advise the kernel to read the file sequentially (increases readahead)
         if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -1440,6 +1517,12 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_17M,
+    MODEL_22M,
+    MODEL_33M,
+    MODEL_109M,
+    MODEL_137M,
+    MODEL_335M,
     MODEL_0_5B,
     MODEL_1B,
     MODEL_2B,
@@ -1481,6 +1564,7 @@ struct llama_hparams {
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
+    uint32_t n_vocab_type = 0; // for BERT-style token types

     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1490,9 +1574,13 @@ struct llama_hparams {
     uint32_t n_yarn_orig_ctx;
     int32_t rope_scaling_type_train;

-    float f_clamp_kqv;
-    float f_max_alibi_bias;
+    float f_clamp_kqv = 0.0f;
+    float f_max_alibi_bias = 0.0f;
+
+    bool causal_attn = true;
+    bool need_kq_pos = false;

+    uint32_t pooling_type = LLAMA_POOLING_NONE;

     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1554,6 +1642,7 @@ struct llama_cparams {

     bool mul_mat_q;
     bool offload_kqv;
+    bool do_pooling;

     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
@@ -1569,6 +1658,8 @@ struct llama_layer {
     struct ggml_tensor * attn_q_norm_b;
     struct ggml_tensor * attn_k_norm;
     struct ggml_tensor * attn_k_norm_b;
+    struct ggml_tensor * attn_out_norm;
+    struct ggml_tensor * attn_out_norm_b;

     // attention
     struct ggml_tensor * wq;
@@ -1587,6 +1678,8 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * layer_out_norm;
+    struct ggml_tensor * layer_out_norm_b;

     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -1720,6 +1813,7 @@ struct llama_model {
     llama_vocab vocab;

     struct ggml_tensor * tok_embd;
+    struct ggml_tensor * type_embd;
     struct ggml_tensor * pos_embd;
     struct ggml_tensor * tok_norm;
     struct ggml_tensor * tok_norm_b;
@@ -1839,8 +1933,6 @@ struct llama_context {
     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
-    // allocator for the input tensors
-    ggml_tallocr * alloc = nullptr;

     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;
@@ -1849,7 +1941,10 @@ struct llama_context {
     struct ggml_tensor * inp_embd;    // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+    struct ggml_tensor * inp_KQ_pos;  // F32 [n_ctx]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
+    struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_cls;     // I32 [n_batch]

 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -2448,6 +2543,8 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
                 case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+                case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+                case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2693,13 +2790,7 @@ struct llama_model_loader {

         std::vector<no_init<uint8_t>> read_buf;

-        for (
-            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-            if (!cur) {
-                // some tensors may be allocated in a different context
-                continue;
-            }
-
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
@@ -2797,6 +2888,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";

         default: return "unknown, may not work";
     }
@@ -2804,6 +2897,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_0_5B: return "0.5B";
         case MODEL_1B: return "1B";
         case MODEL_2B: return "2B";
         case MODEL_3B: return "3B";
@@ -2829,6 +2927,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
         case LLAMA_VOCAB_TYPE_SPM: return "SPM";
         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
         default: return "unknown";
     }
 }
@@ -2972,6 +3071,11 @@ static void llm_load_hparams(
                     case 40: model.type = e_model::MODEL_13B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                if (model.type == e_model::MODEL_13B) {
+                    // TODO: become GGUF KV parameter
+                    hparams.f_max_alibi_bias = 8.0f;
+                }
             } break;
         case LLM_ARCH_STARCODER:
             {
@@ -2999,6 +3103,41 @@ static void llm_load_hparams(
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
+            } break;
+        case LLM_ARCH_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                switch (hparams.n_layer) {
+                    case 3:
+                        model.type = e_model::MODEL_17M; break; // bge-micro
+                    case 6:
+                        model.type = e_model::MODEL_22M; break; // MiniLM-L6
+                    case 12:
+                        switch (hparams.n_embd) {
+                            case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
+                            case 768: model.type = e_model::MODEL_109M; break; // bge-base
+                        } break;
+                    case 24:
+                        model.type = e_model::MODEL_335M; break; // bge-large
+                }
+            } break;
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+                    model.type = e_model::MODEL_137M;
+                }
             } break;
         case LLM_ARCH_BLOOM:
             {
@@ -3012,11 +3151,12 @@ static void llm_load_hparams(
                     case 4096: model.type = e_model::MODEL_7B; break;
                 } break;
                 }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
             } break;
         case LLM_ARCH_MPT:
             {
-                hparams.f_clamp_kqv = 0.0f;
-
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                 ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
@@ -3114,10 +3254,24 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: model.type = e_model::MODEL_2B; break;
+                    case 28: model.type = e_model::MODEL_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

     model.ftype = ml.ftype;
+
+    if (hparams.f_max_alibi_bias > 0.0f) {
+        hparams.need_kq_pos = true;
+    }
 }

 // TODO: This should probably be in llama.h
@@ -3204,6 +3358,16 @@ static void llm_load_vocab(
             vocab.special_unk_id = -1;
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
+        } else if (tokenizer_name == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            vocab.special_bos_id = 101;
+            vocab.special_eos_id = 102;
+            vocab.special_unk_id = 100;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.add_space_prefix = false;
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3231,7 +3395,14 @@ static void llm_load_vocab(

     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+        vocab.linefeed_id = vocab.special_pad_id;
     } else {
         const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
         GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3544,7 +3715,7 @@ static bool llm_load_tensors(
     }

     // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
@@ -3569,6 +3740,7 @@ static bool llm_load_tensors(
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
     const int64_t n_embd_gqa = n_embd_v_gqa;
     const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_vocab_type = hparams.n_vocab_type;
     const int64_t n_ff = hparams.n_ff;

     GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@@ -3681,6 +3853,7 @@ static bool llm_load_tensors(
                     } else {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -3783,11 +3956,63 @@ static bool llm_load_tensors(
                         layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
                     }
                 } break;
+            case LLM_ARCH_BERT:
+            case LLM_ARCH_NOMIC_BERT:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+                    if (model.arch == LLM_ARCH_BERT) {
+                        model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+                    }
+
+                    model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        if (model.arch == LLM_ARCH_BERT) {
+                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+                            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+                            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+                        } else {
+                            layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        }
+
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+                        layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+
+                        if (model.arch == LLM_ARCH_BERT) {
+                            layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+                            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+
+                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                        } else {
+                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        }
+
+                        layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                        layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+                    }
+                } break;
             case LLM_ARCH_BLOOM:
                 {
-                    model.tok_embd = ml.create_tensor(ctx_input,
-                    model.tok_norm = ml.create_tensor(
-                    model.tok_norm_b = ml.create_tensor(
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});

                     // output
                     {
@@ -3828,7 +4053,12 @@ static bool llm_load_tensors(
                     // output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.
+                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }

                     for (int i = 0; i < n_layer; ++i) {
@@ -3837,14 +4067,23 @@ static bool llm_load_tensors(

                         auto & layer = model.layers[i];

-                        layer.attn_norm
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);

                         layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);

-                        layer.ffn_norm
-                        layer.
-
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

                         // AWQ ScaleActivation layer
                         layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4157,6 +4396,40 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     }
                 } break;
+            case LLM_ARCH_GEMMA:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+
+                    const int64_t n_ff = hparams.n_ff;
+                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+                    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -4259,9 +4532,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

     model.hparams.vocab_only = params.vocab_only;

-    llm_load_arch(ml, model);
-    llm_load_hparams(ml, model);
-    llm_load_vocab(ml, model);
+    try {
+        llm_load_arch(ml, model);
+    } catch(const std::exception & e) {
+        throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+    }
+    try {
+        llm_load_hparams(ml, model);
+    } catch(const std::exception & e) {
+        throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+    }
+    try {
+        llm_load_vocab(ml, model);
+    } catch(const std::exception & e) {
+        throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+    }

     llm_load_print_meta(ml, model);

@@ -4578,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
+        struct ggml_tensor * kq_pos,
         int64_t n_ctx,
         int32_t n_tokens,
         int32_t n_kv,
-        float max_alibi_bias,
         float kq_scale,
         const llm_build_cb & cb,
         int il) {
@@ -4611,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
     }

-
-
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+#pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
+#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
+    if (hparams.f_max_alibi_bias > 0.0f) {
         kq = ggml_scale(ctx, kq, kq_scale);
         cb(kq, "kq_scaled", il);

-
-
-        // TODO: K-shift is likely not working
-        // TODO: change to ggml_add
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
-        }
+        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+        cb(kq, "kq_scaled_alibi", il);

         kq = ggml_add(ctx, kq, kq_mask);
         cb(kq, "kq_masked", il);

         kq = ggml_soft_max(ctx, kq);
         cb(kq, "kq_soft_max", il);
-    } else
-
+    } else
+#endif
+    {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
         cb(kq, "kq_soft_max_ext", il);
     }

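The hunk above routes the ALiBi bias through `ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, f_max_alibi_bias)` instead of a separate `ggml_alibi` pass, keeping the old path only for back-ends that have not implemented it yet. The per-row math the fused call stands for is roughly softmax(scale·score + mask + slope·pos) for each attention head. A minimal scalar sketch of that formula on plain arrays (the slope, sizes, and values are illustrative; this is not the ggml kernel):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// softmax(scale*score + mask + slope*pos) over one attention row,
// i.e. the shape of the computation the fused soft-max call performs per head
std::vector<float> soft_max_ext_row(const std::vector<float> & score,
                                    const std::vector<float> & mask,
                                    const std::vector<float> & pos,
                                    float scale, float slope) {
    std::vector<float> out(score.size());
    float max_v = -INFINITY;
    for (size_t i = 0; i < score.size(); ++i) {
        out[i] = scale*score[i] + mask[i] + slope*pos[i];
        max_v = std::max(max_v, out[i]);
    }
    float sum = 0.0f;
    for (float & v : out) { v = std::exp(v - max_v); sum += v; }
    for (float & v : out) { v /= sum; }
    return out;
}

int main() {
    // 4 KV positions, mask fully open, illustrative slope for one head
    auto p = soft_max_ext_row({0.2f, 0.5f, 0.1f, 0.9f}, {0, 0, 0, 0},
                              {0, 1, 2, 3}, 1.0f/8.0f, 0.25f);
    for (float v : p) printf("%.3f ", v);
    printf("\n");
}
```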
@@ -4678,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
+        struct ggml_tensor * kq_pos,
         int64_t n_ctx,
         int32_t n_tokens,
         int32_t kv_head,
         int32_t n_kv,
-        float max_alibi_bias,
         float kq_scale,
         const llm_build_cb & cb,
         int il) {
@@ -4696,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

     struct ggml_tensor * cur;
-    cur = llm_build_kqv(ctx, model, hparams, kv, graph,
-
-            q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);

     return cur;
@@ -4739,6 +5023,7 @@ struct llm_build_context {
     const int32_t n_orig_ctx;

     const bool do_rope_shift;
+    const uint32_t pooling_type;

     const llm_build_cb & cb;

@@ -4782,6 +5067,7 @@ struct llm_build_context {
         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
         do_rope_shift (worst_case || kv_self.has_shift),
+        pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -4864,7 +5150,7 @@ struct llm_build_context {
             }

             Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                 hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -4879,7 +5165,7 @@ struct llm_build_context {

             cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }

@@ -5009,6 +5295,10 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         // shift the entire K-cache if needed
         if (do_rope_shift) {
             llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
@@ -5057,12 +5347,9 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);


-                // apply ALiBi for 13B model
-                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
-
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5186,7 +5473,7 @@

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5285,7 +5572,7 @@

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5490,7 +5777,7 @@

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5552,6 +5839,10 @@
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -5579,7 +5870,7 @@

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5625,7 +5916,7 @@
         return gf;
     }

-    struct ggml_cgraph *
+    struct ggml_cgraph * build_bert() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -5635,34 +5926,58 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

+        // get input vectors with right size
+        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
+
+        // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+
+        // token types are hardcoded to zero ("Sentence A")
+        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+        inpL = ggml_add(ctx0, inpL, type_row0);
+        if (model.arch == LLM_ARCH_BERT) {
+            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+        }
         cb(inpL, "inp_embd", -1);
+
+        // embed layer norm
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+        cb(inpL, "inp_norm", -1);
+
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        inpL = llm_build_norm(ctx0, inpL, hparams,
-                model.tok_norm,
-                model.tok_norm_b,
-                LLM_NORM, cb, -1);
-        cb(inpL, "inp_norm", -1);
+        cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]

+        // iterate layers
         for (int il = 0; il < n_layer; ++il) {
-            cur =
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
+            struct ggml_tensor * cur = inpL;

             // self-attention
-            {
+            if (model.arch == LLM_ARCH_BERT) {
+                struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                // seems like we just need to do this for Q?
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            } else {
+                // compute Q and K and RoPE them
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
                 struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5671,54 +5986,82 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                Qcur =
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }

-            //
-
-            cb(ffn_inp, "ffn_inp", il);
+            // re-add the layer input
+            cur = ggml_add(ctx0, cur, inpL);

-            //
-
-
-
-
-                    LLM_NORM, cb, il);
-            cb(cur, "ffn_norm", il);
+            // attention layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
+
+            struct ggml_tensor * ffn_inp = cur;
+            cb(ffn_inp, "ffn_inp", il);

+            // feed-forward network
+            if (model.arch == LLM_ARCH_BERT) {
                 cur = llm_build_ffn(ctx0, cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-
+            } else {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             }
+            cb(cur, "ffn_out", il);

-
-
+            // attentions bypass the intermediate layer
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            // output layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
+
+            // input for next layer
+            inpL = cur;
         }

-
-
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
+        // final output
+        cur = inpL;

-
-
+        // pooling layer
+        if (pooling_type == LLAMA_POOLING_MEAN) {
+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+        } else if (pooling_type == LLAMA_POOLING_CLS) {
+            cur = ggml_get_rows(ctx0, cur, inp_cls);
+        } else {
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
+        }
+        cb(cur, "result_embd", -1);

         ggml_build_forward_expand(gf, cur);

         return gf;
     }

-    struct ggml_cgraph *
+    struct ggml_cgraph * build_bloom() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         const int64_t n_embd_head = hparams.n_embd_head_v;
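In the `build_bert` graph above, `LLAMA_POOLING_MEAN` is realized as a matrix product with `inp_mean` (whose entries weight each token of a sequence by 1/n), while `LLAMA_POOLING_CLS` simply gathers the first token's row. A small standalone sketch of the mean-pooling arithmetic on plain arrays (sizes and values are illustrative; this is not ggml code):

```cpp
#include <cstdio>
#include <vector>

// mean-pool token embeddings: out[e] = sum_t emb[t][e] / n_tokens,
// the same contraction build_bert expresses as ggml_mul_mat(transpose(cur), inp_mean)
std::vector<float> mean_pool(const std::vector<std::vector<float>> & emb) {
    const size_t n_tokens = emb.size();
    const size_t n_embd   = emb[0].size();
    std::vector<float> out(n_embd, 0.0f);
    for (size_t t = 0; t < n_tokens; ++t) {
        for (size_t e = 0; e < n_embd; ++e) {
            out[e] += emb[t][e] / float(n_tokens);
        }
    }
    return out;
}

int main() {
    // 3 tokens, 4-dim embeddings (illustrative values)
    std::vector<std::vector<float>> emb = {
        {1, 2, 3, 4}, {3, 2, 1, 0}, {2, 2, 2, 2},
    };
    for (float v : mean_pool(emb)) printf("%.2f ", v); // 2.00 2.00 2.00 2.00
    printf("\n");
}
```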
@@ -5735,14 +6078,115 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-
-
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);

-
+        inpL = llm_build_norm(ctx0, inpL, hparams,
+                model.tok_norm,
+                model.tok_norm_b,
+                LLM_NORM, cb, -1);
+        cb(inpL, "inp_norm", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
-
+                    model.layers[il].attn_norm_b,
                     LLM_NORM, cb, il);
-            cb(
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // Add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                        NULL, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_mpt() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+        cb(KQ_pos, "KQ_pos", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * attn_norm;
+
+            attn_norm = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(attn_norm, "attn_norm", il);

             // self-attention
             {
@@ -5751,6 +6195,11 @@ struct llm_build_context {
|
|
5751
6195
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5752
6196
|
cb(cur, "wqkv", il);
|
5753
6197
|
|
6198
|
+
if (model.layers[il].bqkv){
|
6199
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
6200
|
+
cb(cur, "bqkv", il);
|
6201
|
+
}
|
6202
|
+
|
5754
6203
|
if (hparams.f_clamp_kqv > 0.0f) {
|
5755
6204
|
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
5756
6205
|
cb(cur, "wqkv_clamped", il);
|
@@ -5767,8 +6216,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -5780,13 +6229,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
-                        model.layers[il].ffn_down,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -5803,7 +6252,7 @@ struct llm_build_context {
 
         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
 
@@ -5890,7 +6339,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6005,7 +6454,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6126,7 +6575,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6253,7 +6702,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6356,7 +6805,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
             struct ggml_tensor * sa_out = cur;
@@ -6455,7 +6904,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6564,7 +7013,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6682,7 +7131,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6801,7 +7250,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -6933,7 +7382,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
 
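Taken together, these hunks track one internal change: llm_build_kv now receives the attention output bias, an optional KQ_pos tensor (positions for an ALiBi-style bias, used by the MPT builder above), and an explicit KQ scale instead of deriving the scale itself. As a hedged note that is not part of the diff itself: the value passed in most builders is the usual scaled-dot-product factor, and 1.0f is passed where the query was already scaled earlier in the graph (Persimmon, and Gemma below):

\[ \operatorname{Attention}(Q,K,V) = \operatorname{softmax}\left(\mathrm{scale}\cdot QK^{\top} + M\right)V, \qquad \mathrm{scale} = \frac{1}{\sqrt{d_k}} \]

where \(d_k\) is n_embd_head and \(M\) is the broadcast KQ mask (plus the position-derived bias when KQ_pos is supplied).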
@@ -6992,16 +7441,124 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_gemma() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
-    const llama_batch & batch
+    const llama_batch & batch,
+                   bool worst_case) {
     const auto & model = lctx.model;
 
-    // check if we should build the worst-case graph (for memory measurement)
-    const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
-
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         if (il >= 0) {
@@ -7022,67 +7579,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7022
7579
|
|
7023
7580
|
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
7024
7581
|
|
7025
|
-
//
|
7026
|
-
// set input data
|
7027
|
-
//
|
7028
|
-
|
7029
|
-
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
7030
|
-
if (batch.token) {
|
7031
|
-
const int64_t n_tokens = batch.n_tokens;
|
7032
|
-
|
7033
|
-
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
7034
|
-
}
|
7035
|
-
|
7036
|
-
if (batch.embd) {
|
7037
|
-
const int64_t n_embd = llm.n_embd;
|
7038
|
-
const int64_t n_tokens = batch.n_tokens;
|
7039
|
-
|
7040
|
-
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
7041
|
-
}
|
7042
|
-
|
7043
|
-
if (batch.pos) {
|
7044
|
-
const int64_t n_tokens = batch.n_tokens;
|
7045
|
-
|
7046
|
-
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
7047
|
-
}
|
7048
|
-
|
7049
|
-
{
|
7050
|
-
const int64_t n_kv = llm.n_kv;
|
7051
|
-
const int64_t n_tokens = batch.n_tokens;
|
7052
|
-
|
7053
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
7054
|
-
float * data = (float *) lctx.inp_KQ_mask->data;
|
7055
|
-
|
7056
|
-
for (int h = 0; h < 1; ++h) {
|
7057
|
-
for (int j = 0; j < n_tokens; ++j) {
|
7058
|
-
const llama_pos pos = batch.pos[j];
|
7059
|
-
const llama_seq_id seq_id = batch.seq_id[j][0];
|
7060
|
-
|
7061
|
-
for (int i = 0; i < n_kv; ++i) {
|
7062
|
-
float f;
|
7063
|
-
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
7064
|
-
f = -INFINITY;
|
7065
|
-
} else {
|
7066
|
-
f = 0;
|
7067
|
-
}
|
7068
|
-
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
7069
|
-
}
|
7070
|
-
}
|
7071
|
-
}
|
7072
|
-
}
|
7073
|
-
|
7074
|
-
if (llm.do_rope_shift) {
|
7075
|
-
const int64_t n_ctx = llm.n_ctx;
|
7076
|
-
|
7077
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7078
|
-
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7079
|
-
|
7080
|
-
for (int i = 0; i < n_ctx; ++i) {
|
7081
|
-
data[i] = lctx.kv_self.cells[i].delta;
|
7082
|
-
}
|
7083
|
-
}
|
7084
|
-
}
|
7085
|
-
|
7086
7582
|
llm.init();
|
7087
7583
|
|
7088
7584
|
switch (model.arch) {
|
@@ -7110,6 +7606,11 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_refact();
            } break;
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                result = llm.build_bert();
+            } break;
        case LLM_ARCH_BLOOM:
            {
                result = llm.build_bloom();
@@ -7158,6 +7659,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_minicpm();
            } break;
+        case LLM_ARCH_GEMMA:
+            {
+                result = llm.build_gemma();
+            } break;
        default:
            GGML_ASSERT(false);
    }
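With the bert/nomic-bert and gemma cases wired into llama_build_graph, a GGUF file converted from one of these architectures is dispatched to the right graph builder automatically at load time; nothing changes on the caller's side. The following is only a hedged sketch of the public C API flow, not taken from this diff: the model file name is a placeholder and error handling is reduced to early returns.

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(); // NUMA setup is now a separate call (llama_numa_init), see below

    llama_model_params mparams = llama_model_default_params();
    // placeholder path; any supported architecture (llama, gemma, bert, ...) is detected from the GGUF metadata
    llama_model * model = llama_load_model_from_file("gemma-2b-it.Q4_K_M.gguf", mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 2048;
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize, llama_decode(), sample ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}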
@@ -7167,6 +7672,129 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7167
7672
|
return result;
|
7168
7673
|
}
|
7169
7674
|
|
7675
|
+
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
7676
|
+
//
|
7677
|
+
// set input data
|
7678
|
+
//
|
7679
|
+
|
7680
|
+
const auto & hparams = lctx.model.hparams;
|
7681
|
+
const auto & cparams = lctx.cparams;
|
7682
|
+
const auto & kv_self = lctx.kv_self;
|
7683
|
+
|
7684
|
+
if (batch.token) {
|
7685
|
+
const int64_t n_tokens = batch.n_tokens;
|
7686
|
+
|
7687
|
+
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
7688
|
+
}
|
7689
|
+
|
7690
|
+
if (batch.embd) {
|
7691
|
+
const int64_t n_embd = hparams.n_embd;
|
7692
|
+
const int64_t n_tokens = batch.n_tokens;
|
7693
|
+
|
7694
|
+
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
7695
|
+
}
|
7696
|
+
|
7697
|
+
if (batch.pos) {
|
7698
|
+
const int64_t n_tokens = batch.n_tokens;
|
7699
|
+
|
7700
|
+
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
7701
|
+
}
|
7702
|
+
|
7703
|
+
{
|
7704
|
+
const int64_t n_kv = kv_self.n;
|
7705
|
+
const int64_t n_tokens = batch.n_tokens;
|
7706
|
+
|
7707
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
7708
|
+
|
7709
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
7710
|
+
|
7711
|
+
for (int h = 0; h < 1; ++h) {
|
7712
|
+
for (int j = 0; j < n_tokens; ++j) {
|
7713
|
+
const llama_pos pos = batch.pos[j];
|
7714
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
7715
|
+
|
7716
|
+
for (int i = 0; i < n_kv; ++i) {
|
7717
|
+
float f;
|
7718
|
+
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
|
7719
|
+
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
|
7720
|
+
f = -INFINITY;
|
7721
|
+
} else {
|
7722
|
+
f = 0;
|
7723
|
+
}
|
7724
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
7725
|
+
}
|
7726
|
+
}
|
7727
|
+
}
|
7728
|
+
}
|
7729
|
+
|
7730
|
+
if (hparams.need_kq_pos) {
|
7731
|
+
const int64_t n_kv = kv_self.n;
|
7732
|
+
|
7733
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
7734
|
+
|
7735
|
+
float * data = (float *) lctx.inp_KQ_pos->data;
|
7736
|
+
|
7737
|
+
for (int i = 0; i < n_kv; ++i) {
|
7738
|
+
data[i] = float(lctx.kv_self.cells[i].pos);
|
7739
|
+
}
|
7740
|
+
}
|
7741
|
+
|
7742
|
+
if (kv_self.has_shift) {
|
7743
|
+
const int64_t n_ctx = cparams.n_ctx;
|
7744
|
+
|
7745
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7746
|
+
|
7747
|
+
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7748
|
+
|
7749
|
+
for (int i = 0; i < n_ctx; ++i) {
|
7750
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
7751
|
+
}
|
7752
|
+
}
|
7753
|
+
|
7754
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
7755
|
+
const int64_t n_tokens = batch.n_tokens;
|
7756
|
+
|
7757
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
7758
|
+
float * data = (float *) lctx.inp_mean->data;
|
7759
|
+
|
7760
|
+
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
7761
|
+
|
7762
|
+
std::vector<uint64_t> sum(n_tokens, 0);
|
7763
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7764
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7765
|
+
sum[seq_id] += 1;
|
7766
|
+
}
|
7767
|
+
|
7768
|
+
std::vector<float> div(n_tokens, 0.0f);
|
7769
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7770
|
+
const uint64_t s = sum[i];
|
7771
|
+
if (s > 0) {
|
7772
|
+
div[i] = 1.0f/float(s);
|
7773
|
+
}
|
7774
|
+
}
|
7775
|
+
|
7776
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7777
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7778
|
+
data[seq_id*n_tokens + i] = div[seq_id];
|
7779
|
+
}
|
7780
|
+
}
|
7781
|
+
|
7782
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
|
7783
|
+
const int64_t n_tokens = batch.n_tokens;
|
7784
|
+
|
7785
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
7786
|
+
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
7787
|
+
|
7788
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7789
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7790
|
+
const llama_pos pos = batch.pos[i];
|
7791
|
+
if (pos == 0) {
|
7792
|
+
data[seq_id] = i;
|
7793
|
+
}
|
7794
|
+
}
|
7795
|
+
}
|
7796
|
+
}
|
7797
|
+
|
7170
7798
|
// decode a batch of tokens by evaluating the transformer
|
7171
7799
|
//
|
7172
7800
|
// - lctx: llama context
|
@@ -7265,17 +7893,22 @@ static int llama_decode_internal(
     ggml_backend_sched_reset(lctx.sched);
     ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
-    ggml_cgraph * gf = llama_build_graph(lctx, batch);
+    ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
 
     // the output is always the last tensor in the graph
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-
-    // the embeddings could be the second to last tensor, or the third to last tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-    if (strcmp(
-        embeddings
-
+    if (strcmp(res->name, "result_output") == 0) {
+        // the embeddings could be the second to last tensor, or the third to last tensor
+        if (strcmp(embeddings->name, "result_norm") != 0) {
+            embeddings = gf->nodes[gf->n_nodes - 3];
+            GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+        }
+    } else if (strcmp(res->name, "result_embd") == 0) {
+        embeddings = res;
+        res = nullptr;
+    } else {
+        GGML_ASSERT(false);
     }
 
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7285,7 +7918,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
    //       with the BLAS calls. need a better solution
-
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
        n_threads = std::min(4, n_threads);
    }
 
@@ -7303,6 +7938,9 @@ static int llama_decode_internal(
    if (lctx.backend_cpu != nullptr) {
        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
    }
+
+    llama_set_inputs(lctx, batch);
+
    ggml_backend_sched_graph_compute(lctx.sched, gf);
 
    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
@@ -7342,7 +7980,7 @@ static int llama_decode_internal(
    // extract logits
    // TODO: do not compute and extract logits if only embeddings are needed
    //       need to update the graphs to skip "result_output"
-    {
+    if (res) {
        auto & logits_out = lctx.logits;
 
#ifndef NDEBUG
@@ -7386,9 +8024,12 @@ static int llama_decode_internal(
    if (!lctx.embedding.empty()) {
        auto & embedding_out = lctx.embedding;
 
-
+        const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
+        const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
+
+        embedding_out.resize(embd_size);
        ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
-        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(),
+        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
        ggml_backend_synchronize(embeddings_backend);
    }
 
@@ -7452,6 +8093,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
            GGML_ASSERT(false);
            return unicode_to_bytes_bpe(token_data.text);
        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ASSERT(false);
+        }
        default:
            GGML_ASSERT(false);
    }
@@ -7462,8 +8106,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-
+            auto token = vocab.token_to_id.find(buf);
+            if (token != vocab.token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return vocab.token_to_id.at(buf2);
        }
+        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
            return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
        }
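To make the SPM fallback concrete, here is a small standalone sketch (not part of the diff) of the two lookup keys that branch now tries for a raw byte: first the SentencePiece <0xNN> byte-token spelling, then the byte itself as a one-character string.

#include <cstdio>

int main() {
    static const char hex[] = "0123456789ABCDEF";
    const unsigned char ch = 0x41; // the byte for 'A'

    // primary key: SentencePiece byte-token spelling, e.g. "<0x41>"
    const char buf[7]  = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
    // fallback key: the byte itself as a string, e.g. "A"
    const char buf2[2] = { (char) ch, 0 };

    std::printf("lookup order: \"%s\" then \"%s\"\n", buf, buf2);
    return 0;
}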
@@ -7509,7 +8160,7 @@ struct llm_bigram_spm {
};
 
struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
 
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // split string into utf8 chars
@@ -7584,6 +8235,7 @@ private:
 
        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
+            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int)symbol.n; ++j) {
                llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
                output.push_back(token_id);
@@ -7934,29 +8586,230 @@ private:
|
|
7934
8586
|
llm_bigram_bpe::queue work_queue;
|
7935
8587
|
};
|
7936
8588
|
|
7937
|
-
|
8589
|
+
struct llm_tokenizer_wpm {
|
8590
|
+
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
8591
|
+
|
8592
|
+
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
8593
|
+
auto * token_map = &vocab.token_to_id;
|
8594
|
+
|
8595
|
+
// normalize and split by whitespace
|
8596
|
+
std::vector<std::string> words = preprocess(text);
|
8597
|
+
|
8598
|
+
// bos token prepended already
|
8599
|
+
|
8600
|
+
// find the longest tokens that form the words
|
8601
|
+
for (const std::string &word : words) {
|
8602
|
+
// skip empty words
|
8603
|
+
if (word.size() == 0) {
|
8604
|
+
continue;
|
8605
|
+
}
|
8606
|
+
|
8607
|
+
// prepend phantom space
|
8608
|
+
std::string word1 = "\xe2\x96\x81" + word;
|
8609
|
+
int n = word1.size();
|
8610
|
+
|
8611
|
+
// we're at the start of a new word
|
8612
|
+
int i = 0;
|
8613
|
+
bool match_any = false;
|
8614
|
+
|
8615
|
+
// move through character position in word
|
8616
|
+
while (i < n) {
|
8617
|
+
// loop through possible match length
|
8618
|
+
bool match = false;
|
8619
|
+
for (int j = n; j > i; j--) {
|
8620
|
+
auto it = token_map->find(word1.substr(i, j - i));
|
8621
|
+
if (it != token_map->end()) {
|
8622
|
+
output.push_back(it->second);
|
8623
|
+
match = true;
|
8624
|
+
match_any = true;
|
8625
|
+
i = j;
|
8626
|
+
break;
|
8627
|
+
}
|
8628
|
+
}
|
8629
|
+
|
8630
|
+
// must be an unknown character
|
8631
|
+
if (!match) {
|
8632
|
+
i++;
|
8633
|
+
}
|
8634
|
+
}
|
8635
|
+
|
8636
|
+
// we didn't find any matches for this word
|
8637
|
+
if (!match_any) {
|
8638
|
+
output.push_back(vocab.special_unk_id);
|
8639
|
+
}
|
8640
|
+
}
|
8641
|
+
|
8642
|
+
// append eos token
|
8643
|
+
output.push_back(vocab.special_eos_id);
|
8644
|
+
}
|
8645
|
+
|
8646
|
+
std::vector<std::string> preprocess(const std::string & text) {
|
8647
|
+
std::string ori_str = normalize(text);
|
8648
|
+
uint64_t ori_size = ori_str.size();
|
8649
|
+
|
8650
|
+
// single punct / single symbol / single digit
|
8651
|
+
// baseline: add whitespace on the left and right of punct and chinese characters
|
8652
|
+
std::vector<std::string> words;
|
8653
|
+
std::string new_str = "";
|
8654
|
+
uint64_t i = 0;
|
8655
|
+
while (i < ori_size) {
|
8656
|
+
int utf_char_len = utf8_len(ori_str[i]);
|
8657
|
+
if ((utf_char_len == 1) && ispunct(ori_str[i])) {
|
8658
|
+
new_str += " ";
|
8659
|
+
new_str += ori_str[i];
|
8660
|
+
new_str += " ";
|
8661
|
+
i += 1;
|
8662
|
+
}
|
8663
|
+
else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
|
8664
|
+
new_str += " ";
|
8665
|
+
new_str += ori_str.substr(i, 3);
|
8666
|
+
new_str += " ";
|
8667
|
+
i += 3;
|
8668
|
+
}
|
8669
|
+
else {
|
8670
|
+
new_str += ori_str[i];
|
8671
|
+
i += 1;
|
8672
|
+
}
|
8673
|
+
}
|
8674
|
+
|
8675
|
+
// split by whitespace
|
8676
|
+
uint64_t l = 0;
|
8677
|
+
uint64_t r = 0;
|
8678
|
+
while (r < new_str.size()) {
|
8679
|
+
// if is whitespace
|
8680
|
+
if (isspace(new_str[r])) {
|
8681
|
+
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
8682
|
+
l = r + 1;
|
8683
|
+
r = l;
|
8684
|
+
}
|
8685
|
+
else {
|
8686
|
+
r += 1;
|
8687
|
+
}
|
8688
|
+
}
|
8689
|
+
if (r > l) {
|
8690
|
+
words.push_back(new_str.substr(l, (r - l)));
|
8691
|
+
}
|
8692
|
+
return words;
|
8693
|
+
}
|
8694
|
+
|
8695
|
+
std::string normalize(const std::string & text) {
|
8696
|
+
// TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
|
8697
|
+
std::string text2 = strip_accents(text);
|
8698
|
+
for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
|
8699
|
+
char c = text2[i];
|
8700
|
+
if (c >= 'A' && c <= 'Z') {
|
8701
|
+
text2[i] = c - 'A' + 'a';
|
8702
|
+
}
|
8703
|
+
}
|
8704
|
+
return text2;
|
8705
|
+
}
|
8706
|
+
|
8707
|
+
bool is_chinese_char(const std::string & str) {
|
8708
|
+
int len = str.length();
|
8709
|
+
unsigned int codepoint = 0;
|
8710
|
+
int num_bytes = 0;
|
8711
|
+
int i = 0;
|
8712
|
+
unsigned char ch = static_cast<unsigned char>(str[i]);
|
8713
|
+
if (ch <= 0x7f) {
|
8714
|
+
codepoint = ch;
|
8715
|
+
num_bytes = 1;
|
8716
|
+
} else if ((ch >> 5) == 0x06) {
|
8717
|
+
codepoint = ch & 0x1f;
|
8718
|
+
num_bytes = 2;
|
8719
|
+
} else if ((ch >> 4) == 0x0e) {
|
8720
|
+
codepoint = ch & 0x0f;
|
8721
|
+
num_bytes = 3;
|
8722
|
+
} else if ((ch >> 3) == 0x1e) {
|
8723
|
+
codepoint = ch & 0x07;
|
8724
|
+
num_bytes = 4;
|
8725
|
+
}
|
8726
|
+
for (int j = 1; j < num_bytes; ++j) {
|
8727
|
+
if (i + j >= len) {
|
8728
|
+
return false; // incomplete UTF-8 character
|
8729
|
+
}
|
8730
|
+
unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
|
8731
|
+
if ((next_ch >> 6) != 0x02) {
|
8732
|
+
return false; // invalid trailing byte
|
8733
|
+
}
|
8734
|
+
codepoint = (codepoint << 6) | (next_ch & 0x3f);
|
8735
|
+
}
|
8736
|
+
if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
|
8737
|
+
(codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
|
8738
|
+
(codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
|
8739
|
+
(codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
|
8740
|
+
(codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
|
8741
|
+
(codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
|
8742
|
+
(codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
|
8743
|
+
(codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
|
8744
|
+
(codepoint >= 0x3000 && codepoint <= 0x303F) ||
|
8745
|
+
(codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
|
8746
|
+
return true; // NOLINT
|
8747
|
+
}
|
8748
|
+
return false;
|
8749
|
+
}
|
8750
|
+
|
8751
|
+
std::string strip_accents(const std::string & input_string) {
|
8752
|
+
std::string resultString;
|
8753
|
+
std::map<std::string, char> accent_map = {
|
8754
|
+
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
|
8755
|
+
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
|
8756
|
+
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
|
8757
|
+
{"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
|
8758
|
+
{"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
|
8759
|
+
{"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
|
8760
|
+
{"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
|
8761
|
+
{"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
|
8762
|
+
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
|
8763
|
+
};
|
8764
|
+
|
8765
|
+
for (size_t i = 0; i < input_string.length();) {
|
8766
|
+
int len = utf8_len(input_string[i]);
|
8767
|
+
std::string curChar = input_string.substr(i, len);
|
8768
|
+
auto iter = accent_map.find(curChar);
|
8769
|
+
if (iter != accent_map.end()) {
|
8770
|
+
resultString += iter->second;
|
8771
|
+
} else {
|
8772
|
+
resultString += curChar;
|
8773
|
+
}
|
8774
|
+
i += len;
|
8775
|
+
}
|
8776
|
+
|
8777
|
+
return resultString;
|
8778
|
+
}
|
8779
|
+
|
8780
|
+
static size_t utf8_len(char src) {
|
8781
|
+
const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
|
8782
|
+
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
8783
|
+
return lookup[highbits];
|
8784
|
+
}
|
8785
|
+
|
8786
|
+
const llama_vocab & vocab;
|
8787
|
+
};
|
8788
|
+
|
8789
|
+
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
|
7938
8790
|
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
|
7939
8791
|
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
|
7940
8792
|
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
7941
8793
|
|
7942
|
-
struct fragment_buffer_variant{
|
8794
|
+
struct fragment_buffer_variant {
|
7943
8795
|
fragment_buffer_variant(llama_vocab::id _token)
|
7944
8796
|
:
|
7945
8797
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
|
7946
8798
|
token(_token),
|
7947
8799
|
raw_text(_dummy),
|
7948
8800
|
offset(0),
|
7949
|
-
length(0){}
|
8801
|
+
length(0) {}
|
8802
|
+
|
7950
8803
|
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
7951
8804
|
:
|
7952
8805
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
7953
|
-
token((llama_vocab::id)-1),
|
8806
|
+
token((llama_vocab::id) - 1),
|
7954
8807
|
raw_text(_raw_text),
|
7955
8808
|
offset(_offset),
|
7956
8809
|
length(_length){
|
7957
|
-
GGML_ASSERT(
|
7958
|
-
GGML_ASSERT(
|
7959
|
-
GGML_ASSERT(
|
8810
|
+
GGML_ASSERT(_offset >= 0);
|
8811
|
+
GGML_ASSERT(_length >= 1);
|
8812
|
+
GGML_ASSERT(offset + length <= raw_text.length());
|
7960
8813
|
}
|
7961
8814
|
|
7962
8815
|
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
@@ -7969,8 +8822,7 @@ struct fragment_buffer_variant{
|
|
7969
8822
|
|
7970
8823
|
// #define PRETOKENIZERDEBUG
|
7971
8824
|
|
7972
|
-
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
|
7973
|
-
{
|
8825
|
+
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
7974
8826
|
// for each special token
|
7975
8827
|
for (const auto & st: vocab.special_tokens_cache) {
|
7976
8828
|
const auto & special_token = st.first;
|
@@ -8081,17 +8933,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
8081
8933
|
}
|
8082
8934
|
|
8083
8935
|
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
8084
|
-
fragment_buffer.emplace_front(
|
8936
|
+
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
8085
8937
|
|
8086
|
-
if (special) tokenizer_st_partition(
|
8938
|
+
if (special) tokenizer_st_partition(vocab, fragment_buffer);
|
8087
8939
|
|
8088
8940
|
switch (vocab.type) {
|
8089
8941
|
case LLAMA_VOCAB_TYPE_SPM:
|
8090
8942
|
{
|
8091
|
-
for (const auto & fragment: fragment_buffer)
|
8092
|
-
|
8093
|
-
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
8094
|
-
{
|
8943
|
+
for (const auto & fragment : fragment_buffer) {
|
8944
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
8095
8945
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
8096
8946
|
|
8097
8947
|
// TODO: It's likely possible to get rid of this string copy entirely
|
@@ -8111,19 +8961,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
8111
8961
|
llm_tokenizer_spm tokenizer(vocab);
|
8112
8962
|
llama_escape_whitespace(raw_text);
|
8113
8963
|
tokenizer.tokenize(raw_text, output);
|
8114
|
-
}
|
8115
|
-
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8116
|
-
{
|
8964
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8117
8965
|
output.push_back(fragment.token);
|
8118
8966
|
}
|
8119
8967
|
}
|
8120
8968
|
} break;
|
8121
8969
|
case LLAMA_VOCAB_TYPE_BPE:
|
8122
8970
|
{
|
8123
|
-
for (const auto & fragment: fragment_buffer)
|
8124
|
-
|
8125
|
-
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
8126
|
-
{
|
8971
|
+
for (const auto & fragment : fragment_buffer) {
|
8972
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
8127
8973
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
8128
8974
|
|
8129
8975
|
#ifdef PRETOKENIZERDEBUG
|
@@ -8131,9 +8977,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
8131
8977
|
#endif
|
8132
8978
|
llm_tokenizer_bpe tokenizer(vocab);
|
8133
8979
|
tokenizer.tokenize(raw_text, output);
|
8980
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8981
|
+
output.push_back(fragment.token);
|
8134
8982
|
}
|
8135
|
-
|
8136
|
-
|
8983
|
+
}
|
8984
|
+
} break;
|
8985
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
8986
|
+
{
|
8987
|
+
for (const auto & fragment : fragment_buffer) {
|
8988
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
8989
|
+
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
8990
|
+
|
8991
|
+
#ifdef PRETOKENIZERDEBUG
|
8992
|
+
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
8993
|
+
#endif
|
8994
|
+
llm_tokenizer_wpm tokenizer(vocab);
|
8995
|
+
tokenizer.tokenize(raw_text, output);
|
8996
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8137
8997
|
output.push_back(fragment.token);
|
8138
8998
|
}
|
8139
8999
|
}
|
@@ -9640,25 +10500,28 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        return std::make_pair(i_layer, n_layer);
    };
 
-
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
        int nx = tensor->ne[0];
        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
            new_type = GGML_TYPE_Q8_0;
        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (new_type != GGML_TYPE_Q8_0) {
            new_type = GGML_TYPE_Q6_K;
        }
    } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
            new_type = GGML_TYPE_Q2_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_Q4_K;
        }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = GGML_TYPE_Q2_K;
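The net effect of these rules is that the very low-bit types added in this release (such as IQ1_S) get the same special-casing as the IQ2 types: output and token-embedding tensors are kept at higher precision. As a hedged sketch, not taken from the diff, of driving quantization through the public API (file names are placeholders, and the 1- and 2-bit types expect an importance matrix, as the error path later in this file notes):

#include "llama.h"

int main() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_IQ1_S; // new file type referenced in this diff
    qparams.nthread = 8;
    // qparams.imatrix would normally point at importance-matrix data prepared by the caller

    // placeholder input/output paths
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq1_s.gguf", &qparams);
    return rc == 0 ? 0 : 1;
}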
@@ -9668,6 +10531,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9668
10531
|
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
9669
10532
|
++qs.i_ffn_down;
|
9670
10533
|
}
|
10534
|
+
else if (name.find("attn_output.weight") != std::string::npos) {
|
10535
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
|
10536
|
+
}
|
9671
10537
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
9672
10538
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
9673
10539
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
@@ -9682,6 +10548,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9682
10548
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
9683
10549
|
}
|
9684
10550
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
10551
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
|
10552
|
+
new_type = GGML_TYPE_Q5_K;
|
10553
|
+
}
|
9685
10554
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
9686
10555
|
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
9687
10556
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
@@ -9734,6 +10603,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9734
10603
|
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
9735
10604
|
}
|
9736
10605
|
}
|
10606
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
|
10607
|
+
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
|
10608
|
+
}
|
9737
10609
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
9738
10610
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
9739
10611
|
new_type = GGML_TYPE_Q5_K;
|
@@ -9750,7 +10622,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9750
10622
|
if (arch != LLM_ARCH_FALCON) {
|
9751
10623
|
if (qs.model.hparams.n_expert == 8) {
|
9752
10624
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
9753
|
-
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
10625
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
|
9754
10626
|
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
9755
10627
|
new_type = GGML_TYPE_Q5_K;
|
9756
10628
|
}
|
@@ -9785,6 +10657,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9785
10657
|
}
|
9786
10658
|
++qs.i_ffn_up;
|
9787
10659
|
}
|
10660
|
+
|
9788
10661
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
9789
10662
|
//}
|
9790
10663
|
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
@@ -9800,7 +10673,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9800
10673
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
9801
10674
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
9802
10675
|
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
9803
|
-
new_type == GGML_TYPE_IQ3_XXS) {
|
10676
|
+
new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
9804
10677
|
int nx = tensor->ne[0];
|
9805
10678
|
int ny = tensor->ne[1];
|
9806
10679
|
if (nx % QK_K != 0) {
|
@@ -9815,8 +10688,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9815
10688
|
case GGML_TYPE_IQ2_XXS:
|
9816
10689
|
case GGML_TYPE_IQ2_XS:
|
9817
10690
|
case GGML_TYPE_IQ3_XXS:
|
9818
|
-
case
|
9819
|
-
case
|
10691
|
+
case GGML_TYPE_IQ1_S:
|
10692
|
+
case GGML_TYPE_Q2_K:
|
10693
|
+
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
|
9820
10694
|
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
9821
10695
|
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
|
9822
10696
|
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
|
@@ -9844,19 +10718,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9844
10718
|
|
9845
10719
|
// K-quants
|
9846
10720
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
9847
|
-
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
10721
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
9848
10722
|
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
9849
10723
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
9850
10724
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
9851
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
10725
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
9852
10726
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
9853
|
-
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
10727
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
|
9854
10728
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
9855
|
-
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
9856
|
-
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
9857
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
9858
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XS
|
9859
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
10729
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
10730
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
10731
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
10732
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
10733
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
10734
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
|
10735
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
|
9860
10736
|
|
9861
10737
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
9862
10738
|
}
|
@@ -9986,7 +10862,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9986
10862
|
quantize &= !params->only_copy;
|
9987
10863
|
|
9988
10864
|
// do not quantize expert gating tensors
|
9989
|
-
quantize &= name.
|
10865
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
|
10866
|
+
|
10867
|
+
// do not quantize positional embeddings and token types (BERT)
|
10868
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
10869
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
9990
10870
|
|
9991
10871
|
enum ggml_type new_type;
|
9992
10872
|
void * new_data;
|
@@ -10026,6 +10906,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10026
10906
|
}
|
10027
10907
|
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
10028
10908
|
new_type == GGML_TYPE_IQ2_XS ||
|
10909
|
+
new_type == GGML_TYPE_IQ1_S ||
|
10029
10910
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
10030
10911
|
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
10031
10912
|
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
@@ -10260,7 +11141,7 @@ static int llama_apply_lora_from_file_internal(
            {
                LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
                        __func__, ftype);
-                return
+                return 1;
            }
        }
 
@@ -10488,6 +11369,7 @@ struct llama_context_params llama_context_default_params() {
        /*.logits_all  =*/ false,
        /*.embedding   =*/ false,
        /*.offload_kqv =*/ true,
+        /*.do_pooling  =*/ true,
    };
 
    return result;
@@ -10548,7 +11430,7 @@ bool llama_mlock_supported(void) {
    return llama_supports_mlock();
}
 
-void llama_backend_init(
+void llama_backend_init(void) {
    ggml_time_init();
 
    // needed to initialize f16 tables
@@ -10558,15 +11440,17 @@ void llama_backend_init(bool numa) {
        ggml_free(ctx);
    }
 
-    if (numa) {
-        ggml_numa_init();
-    }
-
#ifdef GGML_USE_MPI
    ggml_mpi_backend_init();
#endif
}
 
+void llama_numa_init(enum ggml_numa_strategy numa) {
+    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+        ggml_numa_init(numa);
+    }
+}
+
void llama_backend_free(void) {
#ifdef GGML_USE_MPI
    ggml_mpi_backend_free();
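llama_backend_init no longer takes a bool; NUMA configuration moves to the new llama_numa_init, which takes a ggml_numa_strategy. A hedged sketch of the updated start-up sequence follows; the strategy constant shown is just one of the enum's options and is an assumption about the caller's intent, not something this diff prescribes.

#include "llama.h"

int main() {
    // global init (f16 tables, timing); no longer takes a NUMA flag
    llama_backend_init();

    // opt in to a NUMA strategy separately; GGML_NUMA_STRATEGY_DISABLED makes this a no-op
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

    // ... load models, create contexts, run inference ...

    llama_backend_free();
    return 0;
}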
@@ -10643,6 +11527,7 @@ struct llama_context * llama_new_context_with_model(
    cparams.yarn_beta_slow = params.yarn_beta_slow;
    cparams.mul_mat_q      = params.mul_mat_q;
    cparams.offload_kqv    = params.offload_kqv;
+    cparams.do_pooling     = params.do_pooling;
 
    cparams.n_ctx          = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
    cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -10790,14 +11675,14 @@ struct llama_context * llama_new_context_with_model(
|
|
10790
11675
|
// resized during inference, reserve maximum
|
10791
11676
|
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
10792
11677
|
|
10793
|
-
if (params.embedding){
|
11678
|
+
if (params.embedding) {
|
10794
11679
|
ctx->embedding.resize(hparams.n_embd);
|
10795
11680
|
}
|
10796
11681
|
|
10797
11682
|
// graph inputs
|
10798
11683
|
{
|
10799
11684
|
ggml_init_params init_params = {
|
10800
|
-
/* .mem_size */ ggml_tensor_overhead()*
|
11685
|
+
/* .mem_size */ ggml_tensor_overhead()*8,
|
10801
11686
|
/* .mem_buffer */ nullptr,
|
10802
11687
|
/* .no_alloc */ true,
|
10803
11688
|
};
|
@@ -10807,13 +11692,19 @@ struct llama_context * llama_new_context_with_model(
|
|
10807
11692
|
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
|
10808
11693
|
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
10809
11694
|
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
11695
|
+
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
|
10810
11696
|
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
11697
|
+
ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
11698
|
+
ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
10811
11699
|
|
10812
11700
|
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
10813
11701
|
ggml_set_name(ctx->inp_embd, "inp_embd");
|
10814
11702
|
ggml_set_name(ctx->inp_pos, "inp_pos");
|
10815
11703
|
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
11704
|
+
ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
|
10816
11705
|
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
11706
|
+
ggml_set_name(ctx->inp_mean, "inp_mean");
|
11707
|
+
ggml_set_name(ctx->inp_cls, "inp_cls");
|
10817
11708
|
|
10818
11709
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
10819
11710
|
|
@@ -10839,23 +11730,27 @@ struct llama_context * llama_new_context_with_model(
|
|
10839
11730
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
10840
11731
|
|
10841
11732
|
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
10842
|
-
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
10843
11733
|
|
10844
11734
|
// build worst-case graph
|
10845
11735
|
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
10846
11736
|
int n_past = cparams.n_ctx - n_tokens;
|
10847
11737
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
10848
|
-
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
11738
|
+
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
|
10849
11739
|
|
10850
11740
|
// initialize scheduler with the worst-case graph
|
10851
|
-
|
10852
|
-
|
11741
|
+
if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
|
11742
|
+
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
11743
|
+
llama_free(ctx);
|
11744
|
+
return nullptr;
|
11745
|
+
}
|
10853
11746
|
|
10854
|
-
for (
|
10855
|
-
|
11747
|
+
for (size_t i = 0; i < ctx->backends.size(); i++) {
|
11748
|
+
ggml_backend_t backend = ctx->backends[i];
|
11749
|
+
ggml_backend_buffer_type_t buft = backend_buft[i];
|
11750
|
+
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
|
10856
11751
|
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
10857
|
-
|
10858
|
-
|
11752
|
+
ggml_backend_buft_name(buft),
|
11753
|
+
size / 1024.0 / 1024.0);
|
10859
11754
|
}
|
10860
11755
|
|
10861
11756
|
// note: the number of splits during measure is higher than during inference due to the kv shift
|
@@ -11301,18 +12196,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
11301
12196
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
11302
12197
|
|
11303
12198
|
if (kv_buf_size) {
|
11304
|
-
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
11305
|
-
|
11306
12199
|
std::vector<uint8_t> tmp_buf;
|
11307
12200
|
for (int il = 0; il < (int) n_layer; ++il) {
|
11308
|
-
|
12201
|
+
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
12202
|
+
tmp_buf.resize(k_size);
|
11309
12203
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
11310
12204
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
11311
12205
|
|
11312
12206
|
// v is not contiguous, copy row by row
|
11313
|
-
|
12207
|
+
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12208
|
+
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
12209
|
+
tmp_buf.resize(v_row_size);
|
11314
12210
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
11315
|
-
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*
|
12211
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
|
11316
12212
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
11317
12213
|
}
|
11318
12214
|
}
|
@@ -11414,17 +12310,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
11414
12310
|
if (kv_buf_size) {
|
11415
12311
|
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
|
11416
12312
|
|
11417
|
-
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
11418
|
-
|
11419
12313
|
for (int il = 0; il < (int) n_layer; ++il) {
|
11420
|
-
size_t k_size =
|
12314
|
+
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
11421
12315
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
11422
12316
|
inp += k_size;
|
11423
12317
|
|
11424
12318
|
// v is not contiguous, copy row by row
|
11425
|
-
size_t v_row_size =
|
12319
|
+
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12320
|
+
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
11426
12321
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
11427
|
-
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*
|
12322
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
11428
12323
|
inp += v_row_size;
|
11429
12324
|
}
|
11430
12325
|
}
|
@@ -11660,6 +12555,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
}
 
+float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+}
+
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    return model->vocab.id_to_token[token].text.c_str();
}
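llama_get_embeddings_ith exposes individual rows of the embedding buffer that llama_decode fills when the context was created with embedding enabled. For embedding models that produce "result_embd" (the BERT path added in this release), the buffer holds one row per token of the decoded batch. A hedged usage sketch, assuming such a batch of n_tokens tokens has just been decoded:

#include "llama.h"
#include <cstdio>

// assumes: ctx was created with params.embedding = true on an embedding model,
// and a batch of n_tokens tokens has just been decoded with llama_decode(ctx, batch)
static void print_embeddings(llama_context * ctx, const llama_model * model, int n_tokens) {
    const int n_embd = llama_n_embd(model);
    for (int i = 0; i < n_tokens; ++i) {
        const float * embd = llama_get_embeddings_ith(ctx, i); // row i of the n_tokens x n_embd buffer
        std::printf("token %d: [%f, %f, ...] (%d dims)\n", i, embd[0], embd[1], n_embd);
    }
}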
@@ -11744,6 +12643,7 @@ static std::string llama_decode_text(const std::string & text) {
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_SPM: {
            // NOTE: we accept all unsupported token types,
            // suppressing them like CONTROL tokens.
@@ -11809,6 +12709,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     return 0;
 }

+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
+}
+
+// Simple version of "llama_apply_chat_template" that only works with strings
+// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+static int32_t llama_chat_apply_template_internal(
+    const std::string & tmpl,
+    const std::vector<const llama_chat_message *> & chat,
+    std::string & dest, bool add_ass) {
+    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+    std::stringstream ss;
+    if (tmpl.find("<|im_start|>") != std::string::npos) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant\n";
+        }
+    } else if (tmpl.find("[INST]") != std::string::npos) {
+        // llama2 template and its variants
+        // [variant] support system message
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+        // [variant] space before + after response
+        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        // [variant] add BOS inside history
+        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        // [variant] trim spaces from the input message
+        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        // construct the prompt
+        bool is_inside_turn = true; // skip BOS at the beginning
+        ss << "[INST] ";
+        for (auto message : chat) {
+            std::string content = strip_message ? trim(message->content) : message->content;
+            std::string role(message->role);
+            if (!is_inside_turn) {
+                is_inside_turn = true;
+                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+            }
+            if (role == "system") {
+                if (support_system_message) {
+                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                } else {
+                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                    ss << content << "\n";
+                }
+            } else if (role == "user") {
+                ss << content << " [/INST]";
+            } else {
+                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+                is_inside_turn = false;
+            }
+        }
+        // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl.find("<|user|>") != std::string::npos) {
+        // zephyr template
+        for (auto message : chat) {
+            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+        for (auto message : chat) {
+            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+            ss << bos << message->role << "\n" << message->content << "</s>\n";
+        }
+        if (add_ass) {
+            ss << "<s>assistant\n";
+        }
+    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+        // google/gemma-7b-it
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+                system_prompt = trim(message->content);
+                continue;
+            }
+            // in gemma, "assistant" is "model"
+            role = role == "assistant" ? "model" : message->role;
+            ss << "<start_of_turn>" << role << "\n";
+            if (!system_prompt.empty() && role != "model") {
+                ss << system_prompt << "\n\n";
+                system_prompt = "";
+            }
+            ss << trim(message->content) << "<end_of_turn>\n";
+        }
+        if (add_ass) {
+            ss << "<start_of_turn>model\n";
+        }
+    } else {
+        // template not supported
+        return -1;
+    }
+    dest = ss.str();
+    return dest.size();
+}
+
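As a concrete illustration of the heuristic above (not part of the diff): when the template string contains "<|im_start|>", a two-message chat such as { role: "system", content: "You are a helpful assistant" } followed by { role: "user", content: "Hello" } is rendered, with add_ass set, as:

<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant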
+LLAMA_API int32_t llama_chat_apply_template(
+    const struct llama_model * model,
+    const char * tmpl,
+    const struct llama_chat_message * chat,
+    size_t n_msg,
+    bool add_ass,
+    char * buf,
+    int32_t length) {
+    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+    if (tmpl == nullptr) {
+        GGML_ASSERT(model != nullptr);
+        // load template from model
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        if (res < 0) {
+            // worst case: there is no information about template, we will use chatml by default
+            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+        } else {
+            curr_tmpl = std::string(model_template.data(), model_template.size());
+        }
+    }
+    // format the chat to string
+    std::vector<const llama_chat_message *> chat_vec;
+    chat_vec.resize(n_msg);
+    for (size_t i = 0; i < n_msg; i++) {
+        chat_vec[i] = &chat[i];
+    }
+    std::string formatted_chat;
+    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+    if (res < 0) {
+        return res;
+    }
+    strncpy(buf, formatted_chat.c_str(), length);
+    return res;
+}
+
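A hedged usage sketch of the new public entry point (the wrapper function, example messages, and buffer sizes are assumptions, not part of the diff). Passing tmpl == nullptr picks up tokenizer.chat_template from the model metadata; because the return value is the full formatted length even when the output was truncated by strncpy, the buffer is grown and the call repeated if needed:

// Hypothetical wrapper around llama_chat_apply_template.
#include <string>
#include <vector>
#include "llama.h"

static std::string format_chat(const struct llama_model * model, bool add_ass) {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant" },
        { "user",   "Hello"                       },
    };
    const size_t n_msg = sizeof(chat)/sizeof(chat[0]);

    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template(model, nullptr, chat, n_msg, add_ass, buf.data(), (int32_t) buf.size());
    if (res < 0) {
        return std::string(); // template not recognized
    }
    if ((size_t) res > buf.size()) {
        buf.resize(res);      // formatted chat did not fit, retry with a larger buffer
        res = llama_chat_apply_template(model, nullptr, chat, n_msg, add_ass, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), (size_t) res);
}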
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
@@ -11867,6 +12915,7 @@ const char * llama_print_system_info(void) {
     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";

     return s.c_str();
 }
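A minimal sketch (not part of the diff) that simply dumps the capability string, which after this change also reports the MATMUL_INT8 flag:

#include <cstdio>
#include "llama.h"

int main() {
    // prints the feature flags the build was compiled with,
    // e.g. "... VSX = 0 | MATMUL_INT8 = 1 | "
    std::printf("%s\n", llama_print_system_info());
    return 0;
}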