llama_cpp 0.12.5 → 0.12.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -196,6 +196,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER, "starcoder" },
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
@@ -252,6 +254,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
+    LLM_KV_POOLING_LAYER,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -261,6 +264,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_CAUSAL,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -273,6 +277,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
     LLM_KV_TOKENIZER_SCORES,
     LLM_KV_TOKENIZER_MERGES,
     LLM_KV_TOKENIZER_BOS_ID,
@@ -307,6 +312,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+    { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -316,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -328,6 +335,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
     { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
     { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
     { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -355,6 +363,7 @@ struct LLM_KV {
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
+    LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -536,6 +545,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -748,22 +774,37 @@ struct LLM_TN {
     llm_arch arch;
 
     std::string operator()(llm_tensor tensor) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
     }
 };
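The guard added to each `LLM_TN` operator means that looking up a tensor an architecture does not define now yields the sentinel name `__missing__` instead of throwing. A minimal standalone sketch of the naming pattern (simplified types, not the actual llama.cpp code):

```cpp
#include <cstdio>
#include <map>
#include <string>

// Simplified stand-in for the per-architecture tensor-name tables above.
enum demo_tensor { DEMO_ATTN_Q, DEMO_ROPE_FREQS };

static std::string tensor_name(const std::map<demo_tensor, std::string> & names,
                               demo_tensor tensor, const std::string & suffix, int bid) {
    if (names.find(tensor) == names.end()) {
        return "__missing__"; // the fallback introduced in this diff
    }
    char buf[128];
    snprintf(buf, sizeof(buf), names.at(tensor).c_str(), bid); // fills in "%d"
    return std::string(buf) + "." + suffix;
}

int main() {
    const std::map<demo_tensor, std::string> bert_names = {
        { DEMO_ATTN_Q, "blk.%d.attn_q" }, // template taken from the BERT table above
    };
    printf("%s\n", tensor_name(bert_names, DEMO_ATTN_Q, "weight", 2).c_str());     // "blk.2.attn_q.weight"
    printf("%s\n", tensor_name(bert_names, DEMO_ROPE_FREQS, "weight", 0).c_str()); // "__missing__"
    return 0;
}
```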
@@ -1440,6 +1481,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_17M,
+    MODEL_22M,
+    MODEL_33M,
+    MODEL_109M,
+    MODEL_335M,
     MODEL_0_5B,
     MODEL_1B,
     MODEL_2B,
@@ -1481,6 +1527,7 @@ struct llama_hparams {
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
+    uint32_t n_vocab_type = 0; // for BERT-style token types
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1493,6 +1540,9 @@ struct llama_hparams {
     float f_clamp_kqv;
     float f_max_alibi_bias;
 
+    bool causal_attn = true;
+    bool pooling_layer = false;
+
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1554,6 +1604,7 @@ struct llama_cparams {
 
     bool mul_mat_q;
     bool offload_kqv;
+    bool do_pooling;
 
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
@@ -1720,6 +1771,7 @@ struct llama_model {
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embd;
+    struct ggml_tensor * type_embd;
     struct ggml_tensor * pos_embd;
     struct ggml_tensor * tok_norm;
     struct ggml_tensor * tok_norm_b;
@@ -1839,8 +1891,6 @@ struct llama_context {
     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
-    // allocator for the input tensors
-    ggml_tallocr * alloc = nullptr;
 
     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;
@@ -1850,6 +1900,7 @@ struct llama_context {
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
+    struct ggml_tensor * inp_sum;     // F32 [n_batch, n_batch]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -2829,6 +2880,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
         case LLAMA_VOCAB_TYPE_SPM: return "SPM";
         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
         default: return "unknown";
     }
 }
@@ -3000,6 +3052,27 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
+
+                switch (hparams.n_layer) {
+                    case 3:
+                        model.type = e_model::MODEL_17M; break; // bge-micro
+                    case 6:
+                        model.type = e_model::MODEL_22M; break; // MiniLM-L6
+                    case 12:
+                        switch (hparams.n_embd) {
+                            case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
+                            case 768: model.type = e_model::MODEL_109M; break; // bge-base
+                        } break;
+                    case 24:
+                        model.type = e_model::MODEL_335M; break; // bge-large
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3204,6 +3277,16 @@ static void llm_load_vocab(
             vocab.special_unk_id = -1;
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
+        } else if (tokenizer_name == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            vocab.special_bos_id = 101;
+            vocab.special_eos_id = 102;
+            vocab.special_unk_id = 100;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.add_space_prefix = false;
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3231,7 +3314,14 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+        vocab.linefeed_id = vocab.special_pad_id;
     } else {
         const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
         GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3569,6 +3659,7 @@ static bool llm_load_tensors(
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
     const int64_t n_embd_gqa = n_embd_v_gqa;
     const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_vocab_type = hparams.n_vocab_type;
     const int64_t n_ff = hparams.n_ff;
 
     GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@@ -3783,11 +3874,50 @@ static bool llm_load_tensors(
                     layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
                 }
             } break;
-        case LLM_ARCH_BLOOM:
+        case LLM_ARCH_BERT:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+                model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+                model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
 
                 // output
                 {
@@ -4259,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
     model.hparams.vocab_only = params.vocab_only;
 
-    llm_load_arch(ml, model);
-    llm_load_hparams(ml, model);
-    llm_load_vocab(ml, model);
+    try {
+        llm_load_arch(ml, model);
+    } catch(const std::exception & e) {
+        throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+    }
+    try {
+        llm_load_hparams(ml, model);
+    } catch(const std::exception & e) {
+        throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+    }
+    try {
+        llm_load_vocab(ml, model);
+    } catch(const std::exception & e) {
+        throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+    }
 
     llm_load_print_meta(ml, model);
 
@@ -4739,6 +4881,7 @@ struct llm_build_context {
     const int32_t n_orig_ctx;
 
     const bool do_rope_shift;
+    const bool do_pooling;
 
     const llm_build_cb & cb;
 
@@ -4782,6 +4925,7 @@ struct llm_build_context {
         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
         do_rope_shift (worst_case || kv_self.has_shift),
+        do_pooling (hparams.pooling_layer && cparams.do_pooling),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -5625,6 +5769,103 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_bert() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // get input vectors with right size
+        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
+
+        // construct input embeddings (token, type, position)
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+
+        // token types are hardcoded to zero ("Sentence A")
+        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+        inpL = ggml_add(ctx0, inpL, type_row0);
+        inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+        cb(inpL, "inp_embd", -1);
+
+        // embed layer norm
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+        cb(inpL, "inp_norm", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
+
+        // iterate layers
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * cur = inpL;
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                // seems like we just need to do this for Q?
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // re-add the layer input
+            cur = ggml_add(ctx0, cur, inpL);
+
+            // attention layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
+
+            struct ggml_tensor * ffn_inp = cur;
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                    NULL, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            cb(cur, "ffn_out", il);
+
+            // attentions bypass the intermediate layer
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            // output layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        // final output
+        cur = inpL;
+
+        // pooling layer
+        if (do_pooling) {
+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+        }
+        cb(cur, "result_embd", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_bloom() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
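The pooling step at the end of `build_bert` is a single matrix multiply: `inp_sum` is an `n_tokens x n_tokens` pooling matrix applied to the transposed hidden states. Rows filled with `1/n_tokens` average the token embeddings, while the one-hot rows written by the pooling branch of `llama_set_inputs` (further down in this diff) select and sum the tokens of each sequence. A plain C++ sketch of the mean case, without ggml:

```cpp
#include <cstdio>

// Mean pooling as a matrix product: pooled[e] = sum_t hidden[t][e] * w,
// where w = 1/n_tokens is one row of the inp_sum pooling matrix.
int main() {
    const int n_tokens = 3, n_embd = 2;
    const float hidden[n_tokens][n_embd] = { {1, 2}, {3, 4}, {5, 6} };
    const float w = 1.0f / n_tokens;

    float pooled[n_embd] = {0, 0};
    for (int e = 0; e < n_embd; ++e) {
        for (int t = 0; t < n_tokens; ++t) {
            pooled[e] += hidden[t][e] * w;
        }
    }
    printf("pooled = [%.2f, %.2f]\n", pooled[0], pooled[1]); // [3.00, 4.00]
    return 0;
}
```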
@@ -6996,12 +7237,10 @@ struct llm_build_context {
 
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
-     const llama_batch & batch) {
+     const llama_batch & batch,
+                    bool worst_case) {
     const auto & model = lctx.model;
 
-    // check if we should build the worst-case graph (for memory measurement)
-    const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
-
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         if (il >= 0) {
@@ -7022,67 +7261,6 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct llm_build_context llm(lctx, batch, cb, worst_case);
 
-    //
-    // set input data
-    //
-
-    if (!ggml_tallocr_is_measure(lctx.alloc)) {
-        if (batch.token) {
-            const int64_t n_tokens = batch.n_tokens;
-
-            ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
-        }
-
-        if (batch.embd) {
-            const int64_t n_embd   = llm.n_embd;
-            const int64_t n_tokens = batch.n_tokens;
-
-            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
-        }
-
-        if (batch.pos) {
-            const int64_t n_tokens = batch.n_tokens;
-
-            ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
-        }
-
-        {
-            const int64_t n_kv     = llm.n_kv;
-            const int64_t n_tokens = batch.n_tokens;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
-            float * data = (float *) lctx.inp_KQ_mask->data;
-
-            for (int h = 0; h < 1; ++h) {
-                for (int j = 0; j < n_tokens; ++j) {
-                    const llama_pos    pos    = batch.pos[j];
-                    const llama_seq_id seq_id = batch.seq_id[j][0];
-
-                    for (int i = 0; i < n_kv; ++i) {
-                        float f;
-                        if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
-                            f = -INFINITY;
-                        } else {
-                            f = 0;
-                        }
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
-                    }
-                }
-            }
-        }
-
-        if (llm.do_rope_shift) {
-            const int64_t n_ctx = llm.n_ctx;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
-            int32_t * data = (int32_t *) lctx.inp_K_shift->data;
-
-            for (int i = 0; i < n_ctx; ++i) {
-                data[i] = lctx.kv_self.cells[i].delta;
-            }
-        }
-    }
-
     llm.init();
 
     switch (model.arch) {
@@ -7110,6 +7288,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_refact();
             } break;
+        case LLM_ARCH_BERT:
+            {
+                result = llm.build_bert();
+            } break;
         case LLM_ARCH_BLOOM:
            {
                 result = llm.build_bloom();
@@ -7167,6 +7349,97 @@ static struct ggml_cgraph * llama_build_graph(
     return result;
 }
 
+static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+    //
+    // set input data
+    //
+
+    const auto & hparams = lctx.model.hparams;
+    const auto & cparams = lctx.cparams;
+    const auto & kv_self = lctx.kv_self;
+
+    if (batch.token) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+    }
+
+    if (batch.embd) {
+        const int64_t n_embd   = hparams.n_embd;
+        const int64_t n_tokens = batch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+    }
+
+    if (batch.pos) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+    }
+
+    {
+        const int64_t n_kv     = kv_self.n;
+        const int64_t n_tokens = batch.n_tokens;
+
+        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+        float * data = (float *) lctx.inp_KQ_mask->data;
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    float f;
+                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
+                        (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
+                        f = -INFINITY;
+                    } else {
+                        f = 0;
+                    }
+                    data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                }
+            }
+        }
+    }
+
+    {
+        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+        float * data = (float *) lctx.inp_sum->data;
+
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            data[i] = 1.0f/float(batch.n_tokens);
+        }
+    }
+
+    if (kv_self.has_shift) {
+        const int64_t n_ctx = cparams.n_ctx;
+
+        assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+
+        int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+
+        for (int i = 0; i < n_ctx; ++i) {
+            data[i] = lctx.kv_self.cells[i].delta;
+        }
+    }
+
+    if (hparams.pooling_layer && cparams.do_pooling) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+        float * data = (float *) lctx.inp_sum->data;
+
+        memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            data[seq_id*n_tokens + i] = 1.0f;
+        }
+    }
+}
+
 // decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
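The functional change to the KQ mask above is the `hparams.causal_attn` guard: BERT loads `causal_attn = false`, so a key at a later position than the query is no longer forced to `-INFINITY`. An illustrative sketch of the two resulting mask shapes:

```cpp
#include <cstdio>

// Prints the mask pattern produced by the rule above for a 4-token batch:
// an entry is -INFINITY only when causal attention is on and the key
// position is past the query position.
int main() {
    const int n = 4;
    for (int causal = 0; causal <= 1; ++causal) {
        printf("%s\n", causal ? "causal (LLaMA-style):" : "bidirectional (BERT):");
        for (int q = 0; q < n; ++q) {       // query (token) position
            for (int k = 0; k < n; ++k) {   // key position in the KV cache
                const bool masked = causal && (k > q);
                printf("%s ", masked ? "-inf" : "   0");
            }
            printf("\n");
        }
    }
    return 0;
}
```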
@@ -7265,17 +7538,22 @@ static int llama_decode_internal(
     ggml_backend_sched_reset(lctx.sched);
     ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
-    ggml_cgraph * gf = llama_build_graph(lctx, batch);
+    ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
 
     // the output is always the last tensor in the graph
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-
-    // the embeddings could be the second to last tensor, or the third to last tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-    if (strcmp(embeddings->name, "result_norm") != 0) {
-        embeddings = gf->nodes[gf->n_nodes - 3];
-        GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+    if (strcmp(res->name, "result_output") == 0) {
+        // the embeddings could be the second to last tensor, or the third to last tensor
+        if (strcmp(embeddings->name, "result_norm") != 0) {
+            embeddings = gf->nodes[gf->n_nodes - 3];
+            GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+        }
+    } else if (strcmp(res->name, "result_embd") == 0) {
+        embeddings = res;
+        res = nullptr;
+    } else {
+        GGML_ASSERT(false);
     }
 
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7285,7 +7563,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
@@ -7303,6 +7583,9 @@ static int llama_decode_internal(
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
     }
+
+    llama_set_inputs(lctx, batch);
+
     ggml_backend_sched_graph_compute(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
@@ -7342,7 +7625,7 @@ static int llama_decode_internal(
     // extract logits
     // TODO: do not compute and extract logits if only embeddings are needed
     //       need to update the graphs to skip "result_output"
-    {
+    if (res) {
         auto & logits_out = lctx.logits;
 
 #ifndef NDEBUG
@@ -7386,9 +7669,12 @@ static int llama_decode_internal(
     if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
 
-        embedding_out.resize(n_embd);
+        const int64_t embd_pos  = res ? n_embd * (n_tokens-1) : 0;
+        const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
+
+        embedding_out.resize(embd_size);
         ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
-        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
         ggml_backend_synchronize(embeddings_backend);
     }
 
@@ -7452,6 +7738,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
             GGML_ASSERT(false);
             return unicode_to_bytes_bpe(token_data.text);
         }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ASSERT(false);
+        }
         default:
             GGML_ASSERT(false);
     }
@@ -7462,8 +7751,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-            return vocab.token_to_id.at(buf);
+            auto token = vocab.token_to_id.find(buf);
+            if (token != vocab.token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return vocab.token_to_id.at(buf2);
         }
+        case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
             return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
         }
@@ -7509,7 +7805,7 @@ struct llm_bigram_spm {
 };
 
 struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         // split string into utf8 chars
@@ -7584,6 +7880,7 @@ private:
 
         if (p == rev_merge.end()) {
             // output any symbols that did not form tokens as bytes.
+            output.reserve(output.size() + symbol.n);
             for (int j = 0; j < (int)symbol.n; ++j) {
                 llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
                 output.push_back(token_id);
@@ -7934,29 +8231,230 @@ private:
     llm_bigram_bpe::queue work_queue;
 };
 
-
+struct llm_tokenizer_wpm {
+    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        auto * token_map = &vocab.token_to_id;
+
+        // normalize and split by whitespace
+        std::vector<std::string> words = preprocess(text);
+
+        // bos token prepended already
+
+        // find the longest tokens that form the words
+        for (const std::string &word : words) {
+            // skip empty words
+            if (word.size() == 0) {
+                continue;
+            }
+
+            // prepend phantom space
+            std::string word1 = "\xe2\x96\x81" + word;
+            int n = word1.size();
+
+            // we're at the start of a new word
+            int i = 0;
+            bool match_any = false;
+
+            // move through character position in word
+            while (i < n) {
+                // loop through possible match length
+                bool match = false;
+                for (int j = n; j > i; j--) {
+                    auto it = token_map->find(word1.substr(i, j - i));
+                    if (it != token_map->end()) {
+                        output.push_back(it->second);
+                        match = true;
+                        match_any = true;
+                        i = j;
+                        break;
+                    }
+                }
+
+                // must be an unknown character
+                if (!match) {
+                    i++;
+                }
+            }
+
+            // we didn't find any matches for this word
+            if (!match_any) {
+                output.push_back(vocab.special_unk_id);
+            }
+        }
+
+        // append eos token
+        output.push_back(vocab.special_eos_id);
+    }
+
+    std::vector<std::string> preprocess(const std::string & text) {
+        std::string ori_str = normalize(text);
+        uint64_t ori_size = ori_str.size();
+
+        // single punct / single symbol / single digit
+        // baseline: add whitespace on the left and right of punct and chinese characters
+        std::vector<std::string> words;
+        std::string new_str = "";
+        uint64_t i = 0;
+        while (i < ori_size) {
+            int utf_char_len = utf8_len(ori_str[i]);
+            if ((utf_char_len == 1) && ispunct(ori_str[i])) {
+                new_str += " ";
+                new_str += ori_str[i];
+                new_str += " ";
+                i += 1;
+            }
+            else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
+                new_str += " ";
+                new_str += ori_str.substr(i, 3);
+                new_str += " ";
+                i += 3;
+            }
+            else {
+                new_str += ori_str[i];
+                i += 1;
+            }
+        }
+
+        // split by whitespace
+        uint64_t l = 0;
+        uint64_t r = 0;
+        while (r < new_str.size()) {
+            // if is whitespace
+            if (isspace(new_str[r])) {
+                if (r > l) words.push_back(new_str.substr(l, (r - l)));
+                l = r + 1;
+                r = l;
+            }
+            else {
+                r += 1;
+            }
+        }
+        if (r > l) {
+            words.push_back(new_str.substr(l, (r - l)));
+        }
+        return words;
+    }
+
+    std::string normalize(const std::string & text) {
+        // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
+        std::string text2 = strip_accents(text);
+        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
+            char c = text2[i];
+            if (c >= 'A' && c <= 'Z') {
+                text2[i] = c - 'A' + 'a';
+            }
+        }
+        return text2;
+    }
+
+    bool is_chinese_char(const std::string & str) {
+        int len = str.length();
+        unsigned int codepoint = 0;
+        int num_bytes = 0;
+        int i = 0;
+        unsigned char ch = static_cast<unsigned char>(str[i]);
+        if (ch <= 0x7f) {
+            codepoint = ch;
+            num_bytes = 1;
+        } else if ((ch >> 5) == 0x06) {
+            codepoint = ch & 0x1f;
+            num_bytes = 2;
+        } else if ((ch >> 4) == 0x0e) {
+            codepoint = ch & 0x0f;
+            num_bytes = 3;
+        } else if ((ch >> 3) == 0x1e) {
+            codepoint = ch & 0x07;
+            num_bytes = 4;
+        }
+        for (int j = 1; j < num_bytes; ++j) {
+            if (i + j >= len) {
+                return false; // incomplete UTF-8 character
+            }
+            unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
+            if ((next_ch >> 6) != 0x02) {
+                return false; // invalid trailing byte
+            }
+            codepoint = (codepoint << 6) | (next_ch & 0x3f);
+        }
+        if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
+            (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
+            (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
+            (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
+            (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
+            (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+            (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
+            (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
+            (codepoint >= 0x3000 && codepoint <= 0x303F) ||
+            (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
+            return true; // NOLINT
+        }
+        return false;
+    }
+
+    std::string strip_accents(const std::string & input_string) {
+        std::string resultString;
+        std::map<std::string, char> accent_map = {
+            {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
+            {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
+            {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
+            {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
+            {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
+            {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
+            {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
+            {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
+            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
+        };
+
+        for (size_t i = 0; i < input_string.length();) {
+            int len = utf8_len(input_string[i]);
+            std::string curChar = input_string.substr(i, len);
+            auto iter = accent_map.find(curChar);
+            if (iter != accent_map.end()) {
+                resultString += iter->second;
+            } else {
+                resultString += curChar;
+            }
+            i += len;
+        }
+
+        return resultString;
+    }
+
+    static size_t utf8_len(char src) {
+        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+        return lookup[highbits];
+    }
+
+    const llama_vocab & vocab;
+};
+
 typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
     FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
     FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
 } FRAGMENT_BUFFER_VARIANT_TYPE;
 
-struct fragment_buffer_variant{
+struct fragment_buffer_variant {
     fragment_buffer_variant(llama_vocab::id _token)
     :
         type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
         token(_token),
         raw_text(_dummy),
         offset(0),
-        length(0){}
+        length(0) {}
+
     fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
     :
         type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
-        token((llama_vocab::id)-1),
+        token((llama_vocab::id) - 1),
         raw_text(_raw_text),
         offset(_offset),
         length(_length){
-            GGML_ASSERT( _offset >= 0 );
-            GGML_ASSERT( _length >= 1 );
-            GGML_ASSERT( offset + length <= raw_text.length() );
+            GGML_ASSERT(_offset >= 0);
+            GGML_ASSERT(_length >= 1);
+            GGML_ASSERT(offset + length <= raw_text.length());
     }
 
     const FRAGMENT_BUFFER_VARIANT_TYPE type;
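The heart of `llm_tokenizer_wpm::tokenize` is the greedy longest-match loop above. A toy, self-contained illustration with an invented vocabulary (the real tokenizer also prepends the phantom-space marker and emits `special_unk_id` when a whole word fails to match):

```cpp
#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Greedy longest-match WordPiece: at each position, try the longest
// substring first and fall back one character when nothing matches.
// The vocabulary here is invented for illustration.
int main() {
    const std::set<std::string> vocab = { "un", "unbeliev", "able", "bel" };
    const std::string word = "unbelievable";
    std::vector<std::string> pieces;

    size_t i = 0;
    while (i < word.size()) {
        bool match = false;
        for (size_t j = word.size(); j > i; --j) { // longest candidate first
            const std::string cand = word.substr(i, j - i);
            if (vocab.count(cand)) {
                pieces.push_back(cand);
                i = j;
                match = true;
                break;
            }
        }
        if (!match) i++; // unknown character, skip it
    }
    for (const auto & p : pieces) printf("%s ", p.c_str()); // "unbeliev able"
    printf("\n");
    return 0;
}
```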
@@ -7969,8 +8467,7 @@ struct fragment_buffer_variant{
 
 // #define PRETOKENIZERDEBUG
 
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
-{
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const auto & st: vocab.special_tokens_cache) {
         const auto & special_token = st.first;
@@ -8081,17 +8578,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     }
 
     std::forward_list<fragment_buffer_variant> fragment_buffer;
-    fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+    fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
 
-    if (special) tokenizer_st_partition( vocab, fragment_buffer );
+    if (special) tokenizer_st_partition(vocab, fragment_buffer);
 
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
 
                         // TODO: It's likely possible to get rid of this string copy entirely
@@ -8111,19 +8606,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
                         tokenizer.tokenize(raw_text, output);
-                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
@@ -8131,9 +8622,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #endif
                         llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
                     }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            {
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        llm_tokenizer_wpm tokenizer(vocab);
+                        tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
@@ -9785,6 +10290,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         ++qs.i_ffn_up;
     }
+
     // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
     //}
     // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -9844,19 +10350,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  quantized_type = GGML_TYPE_Q4_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K;    break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  quantized_type = GGML_TYPE_Q5_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    quantized_type = GGML_TYPE_Q6_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -9986,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;
 
         // do not quantize expert gating tensors
-        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+        // do not quantize positional embeddings and token types (BERT)
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
         enum ggml_type new_type;
         void * new_data;
@@ -10488,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all  =*/ false,
         /*.embedding   =*/ false,
         /*.offload_kqv =*/ true,
+        /*.do_pooling  =*/ true,
     };
 
     return result;
@@ -10643,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.mul_mat_q      = params.mul_mat_q;
     cparams.offload_kqv    = params.offload_kqv;
+    cparams.do_pooling     = params.do_pooling;
 
     cparams.n_ctx          = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -10790,14 +11302,14 @@ struct llama_context * llama_new_context_with_model(
         // resized during inference, reserve maximum
         ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
 
-        if (params.embedding){
+        if (params.embedding) {
            ctx->embedding.resize(hparams.n_embd);
        }
 
        // graph inputs
        {
            ggml_init_params init_params = {
-                /* .mem_size   */ ggml_tensor_overhead()*5,
+                /* .mem_size   */ ggml_tensor_overhead()*7,
                /* .mem_buffer */ nullptr,
                /* .no_alloc   */ true,
            };
@@ -10808,12 +11320,14 @@ struct llama_context * llama_new_context_with_model(
        ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
        ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
        ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+       ctx->inp_sum     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
 
        ggml_set_name(ctx->inp_tokens, "inp_tokens");
        ggml_set_name(ctx->inp_embd, "inp_embd");
        ggml_set_name(ctx->inp_pos, "inp_pos");
        ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
        ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+       ggml_set_name(ctx->inp_sum, "inp_sum");
 
        ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
 
@@ -10839,23 +11353,27 @@ struct llama_context * llama_new_context_with_model(
        ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
        ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
-       ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
        // build worst-case graph
        int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
        int n_past = cparams.n_ctx - n_tokens;
        llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-       ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
+       ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
 
        // initialize scheduler with the worst-case graph
-       ggml_backend_sched_init_measure(ctx->sched, gf);
-       ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
+       if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
+           LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+           llama_free(ctx);
+           return nullptr;
+       }
 
-       for (ggml_backend_t backend : ctx->backends) {
-           ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
+       for (size_t i = 0; i < ctx->backends.size(); i++) {
+           ggml_backend_t backend = ctx->backends[i];
+           ggml_backend_buffer_type_t buft = backend_buft[i];
+           size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
            LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
-               ggml_backend_buffer_name(buf),
-               ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+               ggml_backend_buft_name(buft),
+               size / 1024.0 / 1024.0);
        }
 
        // note: the number of splits during measure is higher than during inference due to the kv shift
@@ -11660,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }
 
+float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+}
+
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    return model->vocab.id_to_token[token].text.c_str();
 }
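A hedged usage sketch for the new `llama_get_embeddings_ith` accessor (assumes a context created with `embedding` enabled and a batch already decoded; with the pooling layer active the buffer holds one vector per sequence, otherwise per-token vectors):

```cpp
#include <cstdio>
#include "llama.h"

// Illustrative helper, not part of the library: print the i-th embedding.
// As the diff shows, llama_get_embeddings_ith(ctx, i) simply offsets the
// embedding buffer by i * n_embd.
static void print_embedding(llama_context * ctx, const llama_model * model, int32_t i) {
    const int32_t n_embd = llama_n_embd(model);
    const float * embd = llama_get_embeddings_ith(ctx, i);
    for (int32_t j = 0; j < n_embd; ++j) {
        printf("%f ", embd[j]);
    }
    printf("\n");
}
```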
@@ -11744,6 +12266,7 @@ static std::string llama_decode_text(const std::string & text) {
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
+       case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_SPM: {
            // NOTE: we accept all unsupported token types,
            // suppressing them like CONTROL tokens.
@@ -11867,6 +12390,7 @@ const char * llama_print_system_info(void) {
    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
    s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
+   s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
 
    return s.c_str();
 }