llama_cpp 0.12.5 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -196,6 +196,7 @@ enum llm_arch {
|
|
196
196
|
LLM_ARCH_STARCODER,
|
197
197
|
LLM_ARCH_PERSIMMON,
|
198
198
|
LLM_ARCH_REFACT,
|
199
|
+
LLM_ARCH_BERT,
|
199
200
|
LLM_ARCH_BLOOM,
|
200
201
|
LLM_ARCH_STABLELM,
|
201
202
|
LLM_ARCH_QWEN,
|
@@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
220
221
|
{ LLM_ARCH_STARCODER, "starcoder" },
|
221
222
|
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
222
223
|
{ LLM_ARCH_REFACT, "refact" },
|
224
|
+
{ LLM_ARCH_BERT, "bert" },
|
223
225
|
{ LLM_ARCH_BLOOM, "bloom" },
|
224
226
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
225
227
|
{ LLM_ARCH_QWEN, "qwen" },
|
@@ -252,6 +254,7 @@ enum llm_kv {
|
|
252
254
|
LLM_KV_TENSOR_DATA_LAYOUT,
|
253
255
|
LLM_KV_EXPERT_COUNT,
|
254
256
|
LLM_KV_EXPERT_USED_COUNT,
|
257
|
+
LLM_KV_POOLING_LAYER,
|
255
258
|
|
256
259
|
LLM_KV_ATTENTION_HEAD_COUNT,
|
257
260
|
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
@@ -261,6 +264,7 @@ enum llm_kv {
|
|
261
264
|
LLM_KV_ATTENTION_VALUE_LENGTH,
|
262
265
|
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
263
266
|
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
267
|
+
LLM_KV_ATTENTION_CAUSAL,
|
264
268
|
|
265
269
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
266
270
|
LLM_KV_ROPE_FREQ_BASE,
|
@@ -273,6 +277,7 @@ enum llm_kv {
|
|
273
277
|
LLM_KV_TOKENIZER_MODEL,
|
274
278
|
LLM_KV_TOKENIZER_LIST,
|
275
279
|
LLM_KV_TOKENIZER_TOKEN_TYPE,
|
280
|
+
LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
|
276
281
|
LLM_KV_TOKENIZER_SCORES,
|
277
282
|
LLM_KV_TOKENIZER_MERGES,
|
278
283
|
LLM_KV_TOKENIZER_BOS_ID,
|
@@ -307,6 +312,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
307
312
|
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
308
313
|
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
309
314
|
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
315
|
+
{ LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
|
310
316
|
|
311
317
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
312
318
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
@@ -316,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
316
322
|
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
|
317
323
|
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
318
324
|
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
325
|
+
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
|
319
326
|
|
320
327
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
321
328
|
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
@@ -328,6 +335,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
328
335
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
329
336
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
330
337
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
338
|
+
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
331
339
|
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
332
340
|
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
333
341
|
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
@@ -355,6 +363,7 @@ struct LLM_KV {
|
|
355
363
|
enum llm_tensor {
|
356
364
|
LLM_TENSOR_TOKEN_EMBD,
|
357
365
|
LLM_TENSOR_TOKEN_EMBD_NORM,
|
366
|
+
LLM_TENSOR_TOKEN_TYPES,
|
358
367
|
LLM_TENSOR_POS_EMBD,
|
359
368
|
LLM_TENSOR_OUTPUT,
|
360
369
|
LLM_TENSOR_OUTPUT_NORM,
|
@@ -536,6 +545,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
536
545
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
537
546
|
},
|
538
547
|
},
|
548
|
+
{
|
549
|
+
LLM_ARCH_BERT,
|
550
|
+
{
|
551
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
552
|
+
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
553
|
+
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
554
|
+
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
555
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
|
556
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
557
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
558
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
559
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
560
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
|
561
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
562
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
563
|
+
},
|
564
|
+
},
|
539
565
|
{
|
540
566
|
LLM_ARCH_BLOOM,
|
541
567
|
{
|
@@ -748,22 +774,37 @@ struct LLM_TN {
|
|
748
774
|
llm_arch arch;
|
749
775
|
|
750
776
|
// Returns the base name registered for `tensor` under this architecture,
// or the placeholder "__missing__" when no such name is registered.
std::string operator()(llm_tensor tensor) const {
    // Use find() instead of operator[]: operator[] would default-insert an empty
    // table into the global map for an unknown architecture, and the original
    // code repeated the same outer lookup up to three times.
    const auto arch_it = LLM_TENSOR_NAMES.find(arch);
    if (arch_it == LLM_TENSOR_NAMES.end()) {
        return "__missing__";
    }

    const auto name_it = arch_it->second.find(tensor);
    if (name_it == arch_it->second.end()) {
        return "__missing__";
    }

    return name_it->second;
}
|
753
782
|
|
754
783
|
// Returns "<name>.<suffix>" for `tensor` under this architecture, or the
// placeholder "__missing__" (without suffix) when no name is registered.
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
    // Use find() instead of operator[]: operator[] would default-insert an empty
    // table into the global map for an unknown architecture, and the original
    // code repeated the same outer lookup up to three times.
    const auto arch_it = LLM_TENSOR_NAMES.find(arch);
    if (arch_it == LLM_TENSOR_NAMES.end()) {
        return "__missing__";
    }

    const auto name_it = arch_it->second.find(tensor);
    if (name_it == arch_it->second.end()) {
        return "__missing__";
    }

    return name_it->second + "." + suffix;
}
|
757
789
|
|
758
790
|
// Returns the name of `tensor` under this architecture with `bid` substituted
// into the registered name pattern via ::format, or the placeholder
// "__missing__" when no name is registered.
std::string operator()(llm_tensor tensor, int bid) const {
    // Use find() instead of operator[]: operator[] would default-insert an empty
    // table into the global map for an unknown architecture, and the original
    // code repeated the same outer lookup up to three times.
    const auto arch_it = LLM_TENSOR_NAMES.find(arch);
    if (arch_it == LLM_TENSOR_NAMES.end()) {
        return "__missing__";
    }

    const auto name_it = arch_it->second.find(tensor);
    if (name_it == arch_it->second.end()) {
        return "__missing__";
    }

    return ::format(name_it->second.c_str(), bid);
}
|
761
796
|
|
762
797
|
// Returns "<formatted name>.<suffix>" for `tensor` under this architecture,
// where `bid` is substituted into the registered name pattern via ::format.
// Returns the placeholder "__missing__" (without suffix) when no name is
// registered.
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
    // Use find() instead of operator[]: operator[] would default-insert an empty
    // table into the global map for an unknown architecture, and the original
    // code repeated the same outer lookup up to three times.
    const auto arch_it = LLM_TENSOR_NAMES.find(arch);
    if (arch_it == LLM_TENSOR_NAMES.end()) {
        return "__missing__";
    }

    const auto name_it = arch_it->second.find(tensor);
    if (name_it == arch_it->second.end()) {
        return "__missing__";
    }

    return ::format(name_it->second.c_str(), bid) + "." + suffix;
}
|
765
803
|
|
766
804
|
// Returns "<formatted name>.<suffix>" for `tensor` under this architecture,
// where `bid` and `xid` are substituted into the registered name pattern via
// ::format. Returns the placeholder "__missing__" (without suffix) when no
// name is registered.
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
    // Use find() instead of operator[]: operator[] would default-insert an empty
    // table into the global map for an unknown architecture, and the original
    // code repeated the same outer lookup up to three times.
    const auto arch_it = LLM_TENSOR_NAMES.find(arch);
    if (arch_it == LLM_TENSOR_NAMES.end()) {
        return "__missing__";
    }

    const auto name_it = arch_it->second.find(tensor);
    if (name_it == arch_it->second.end()) {
        return "__missing__";
    }

    return ::format(name_it->second.c_str(), bid, xid) + "." + suffix;
}
|
769
810
|
};
|
@@ -1440,6 +1481,11 @@ static llama_state g_state;
|
|
1440
1481
|
// available llama models
|
1441
1482
|
enum e_model {
|
1442
1483
|
MODEL_UNKNOWN,
|
1484
|
+
MODEL_17M,
|
1485
|
+
MODEL_22M,
|
1486
|
+
MODEL_33M,
|
1487
|
+
MODEL_109M,
|
1488
|
+
MODEL_335M,
|
1443
1489
|
MODEL_0_5B,
|
1444
1490
|
MODEL_1B,
|
1445
1491
|
MODEL_2B,
|
@@ -1481,6 +1527,7 @@ struct llama_hparams {
|
|
1481
1527
|
uint32_t n_ff;
|
1482
1528
|
uint32_t n_expert = 0;
|
1483
1529
|
uint32_t n_expert_used = 0;
|
1530
|
+
uint32_t n_vocab_type = 0; // for BERT-style token types
|
1484
1531
|
|
1485
1532
|
float f_norm_eps;
|
1486
1533
|
float f_norm_rms_eps;
|
@@ -1493,6 +1540,9 @@ struct llama_hparams {
|
|
1493
1540
|
float f_clamp_kqv;
|
1494
1541
|
float f_max_alibi_bias;
|
1495
1542
|
|
1543
|
+
bool causal_attn = true;
|
1544
|
+
bool pooling_layer = false;
|
1545
|
+
|
1496
1546
|
|
1497
1547
|
bool operator!=(const llama_hparams & other) const {
|
1498
1548
|
if (this->vocab_only != other.vocab_only) return true;
|
@@ -1554,6 +1604,7 @@ struct llama_cparams {
|
|
1554
1604
|
|
1555
1605
|
bool mul_mat_q;
|
1556
1606
|
bool offload_kqv;
|
1607
|
+
bool do_pooling;
|
1557
1608
|
|
1558
1609
|
ggml_backend_sched_eval_callback cb_eval;
|
1559
1610
|
void * cb_eval_user_data;
|
@@ -1720,6 +1771,7 @@ struct llama_model {
|
|
1720
1771
|
llama_vocab vocab;
|
1721
1772
|
|
1722
1773
|
struct ggml_tensor * tok_embd;
|
1774
|
+
struct ggml_tensor * type_embd;
|
1723
1775
|
struct ggml_tensor * pos_embd;
|
1724
1776
|
struct ggml_tensor * tok_norm;
|
1725
1777
|
struct ggml_tensor * tok_norm_b;
|
@@ -1839,8 +1891,6 @@ struct llama_context {
|
|
1839
1891
|
// memory buffers used to evaluate the model
|
1840
1892
|
std::vector<uint8_t> buf_compute_meta;
|
1841
1893
|
ggml_backend_sched_t sched = nullptr;
|
1842
|
-
// allocator for the input tensors
|
1843
|
-
ggml_tallocr * alloc = nullptr;
|
1844
1894
|
|
1845
1895
|
// input tensors
|
1846
1896
|
ggml_backend_buffer_t buf_input = nullptr;
|
@@ -1850,6 +1900,7 @@ struct llama_context {
|
|
1850
1900
|
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
1851
1901
|
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
1852
1902
|
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
1903
|
+
struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
|
1853
1904
|
|
1854
1905
|
#ifdef GGML_USE_MPI
|
1855
1906
|
ggml_mpi_context * ctx_mpi = NULL;
|
@@ -2829,6 +2880,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
|
2829
2880
|
switch (type) {
|
2830
2881
|
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
2831
2882
|
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
2883
|
+
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
2832
2884
|
default: return "unknown";
|
2833
2885
|
}
|
2834
2886
|
}
|
@@ -3000,6 +3052,27 @@ static void llm_load_hparams(
|
|
3000
3052
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3001
3053
|
}
|
3002
3054
|
} break;
|
3055
|
+
case LLM_ARCH_BERT:
|
3056
|
+
{
|
3057
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
3058
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
3059
|
+
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
3060
|
+
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
|
3061
|
+
|
3062
|
+
switch (hparams.n_layer) {
|
3063
|
+
case 3:
|
3064
|
+
model.type = e_model::MODEL_17M; break; // bge-micro
|
3065
|
+
case 6:
|
3066
|
+
model.type = e_model::MODEL_22M; break; // MiniLM-L6
|
3067
|
+
case 12:
|
3068
|
+
switch (hparams.n_embd) {
|
3069
|
+
case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
|
3070
|
+
case 768: model.type = e_model::MODEL_109M; break; // bge-base
|
3071
|
+
} break;
|
3072
|
+
case 24:
|
3073
|
+
model.type = e_model::MODEL_335M; break; // bge-large
|
3074
|
+
}
|
3075
|
+
} break;
|
3003
3076
|
case LLM_ARCH_BLOOM:
|
3004
3077
|
{
|
3005
3078
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
@@ -3204,6 +3277,16 @@ static void llm_load_vocab(
|
|
3204
3277
|
vocab.special_unk_id = -1;
|
3205
3278
|
vocab.special_sep_id = -1;
|
3206
3279
|
vocab.special_pad_id = -1;
|
3280
|
+
} else if (tokenizer_name == "bert") {
|
3281
|
+
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
3282
|
+
|
3283
|
+
// default special tokens
|
3284
|
+
vocab.special_bos_id = 101;
|
3285
|
+
vocab.special_eos_id = 102;
|
3286
|
+
vocab.special_unk_id = 100;
|
3287
|
+
vocab.special_sep_id = -1;
|
3288
|
+
vocab.special_pad_id = -1;
|
3289
|
+
vocab.add_space_prefix = false;
|
3207
3290
|
} else {
|
3208
3291
|
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
|
3209
3292
|
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
@@ -3231,7 +3314,14 @@ static void llm_load_vocab(
|
|
3231
3314
|
|
3232
3315
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
3233
3316
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
3234
|
-
|
3317
|
+
try {
|
3318
|
+
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
3319
|
+
} catch (const std::exception & e) {
|
3320
|
+
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
|
3321
|
+
vocab.linefeed_id = vocab.special_pad_id;
|
3322
|
+
}
|
3323
|
+
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
3324
|
+
vocab.linefeed_id = vocab.special_pad_id;
|
3235
3325
|
} else {
|
3236
3326
|
const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
|
3237
3327
|
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
@@ -3569,6 +3659,7 @@ static bool llm_load_tensors(
|
|
3569
3659
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
3570
3660
|
const int64_t n_embd_gqa = n_embd_v_gqa;
|
3571
3661
|
const int64_t n_vocab = hparams.n_vocab;
|
3662
|
+
const int64_t n_vocab_type = hparams.n_vocab_type;
|
3572
3663
|
const int64_t n_ff = hparams.n_ff;
|
3573
3664
|
|
3574
3665
|
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
@@ -3783,11 +3874,50 @@ static bool llm_load_tensors(
|
|
3783
3874
|
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
3784
3875
|
}
|
3785
3876
|
} break;
|
3786
|
-
case
|
3877
|
+
case LLM_ARCH_BERT:
|
3787
3878
|
{
|
3788
3879
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3789
|
-
model.
|
3790
|
-
model.
|
3880
|
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
3881
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
3882
|
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
3883
|
+
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
3884
|
+
|
3885
|
+
for (int i = 0; i < n_layer; ++i) {
|
3886
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
3887
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
3888
|
+
|
3889
|
+
auto & layer = model.layers[i];
|
3890
|
+
|
3891
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
3892
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
3893
|
+
|
3894
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
3895
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
3896
|
+
|
3897
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
3898
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
3899
|
+
|
3900
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
3901
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
3902
|
+
|
3903
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
3904
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
3905
|
+
|
3906
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
3907
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
3908
|
+
|
3909
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
3910
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
3911
|
+
|
3912
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
3913
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
3914
|
+
}
|
3915
|
+
} break;
|
3916
|
+
case LLM_ARCH_BLOOM:
|
3917
|
+
{
|
3918
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3919
|
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
3920
|
+
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
3791
3921
|
|
3792
3922
|
// output
|
3793
3923
|
{
|
@@ -4259,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
4259
4389
|
|
4260
4390
|
model.hparams.vocab_only = params.vocab_only;
|
4261
4391
|
|
4262
|
-
|
4263
|
-
|
4264
|
-
|
4392
|
+
try {
|
4393
|
+
llm_load_arch(ml, model);
|
4394
|
+
} catch(const std::exception & e) {
|
4395
|
+
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
|
4396
|
+
}
|
4397
|
+
try {
|
4398
|
+
llm_load_hparams(ml, model);
|
4399
|
+
} catch(const std::exception & e) {
|
4400
|
+
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
4401
|
+
}
|
4402
|
+
try {
|
4403
|
+
llm_load_vocab(ml, model);
|
4404
|
+
} catch(const std::exception & e) {
|
4405
|
+
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
|
4406
|
+
}
|
4265
4407
|
|
4266
4408
|
llm_load_print_meta(ml, model);
|
4267
4409
|
|
@@ -4739,6 +4881,7 @@ struct llm_build_context {
|
|
4739
4881
|
const int32_t n_orig_ctx;
|
4740
4882
|
|
4741
4883
|
const bool do_rope_shift;
|
4884
|
+
const bool do_pooling;
|
4742
4885
|
|
4743
4886
|
const llm_build_cb & cb;
|
4744
4887
|
|
@@ -4782,6 +4925,7 @@ struct llm_build_context {
|
|
4782
4925
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
4783
4926
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
4784
4927
|
do_rope_shift (worst_case || kv_self.has_shift),
|
4928
|
+
do_pooling (hparams.pooling_layer && cparams.do_pooling),
|
4785
4929
|
cb (cb),
|
4786
4930
|
buf_compute_meta (lctx.buf_compute_meta) {
|
4787
4931
|
// all initializations should be done in init()
|
@@ -5625,6 +5769,103 @@ struct llm_build_context {
|
|
5625
5769
|
return gf;
|
5626
5770
|
}
|
5627
5771
|
|
5772
|
+
struct ggml_cgraph * build_bert() {
|
5773
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5774
|
+
|
5775
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5776
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5777
|
+
|
5778
|
+
struct ggml_tensor * cur;
|
5779
|
+
struct ggml_tensor * inpL;
|
5780
|
+
|
5781
|
+
// get input vectors with right size
|
5782
|
+
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
5783
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
5784
|
+
struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
|
5785
|
+
|
5786
|
+
// construct input embeddings (token, type, position)
|
5787
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
5788
|
+
|
5789
|
+
// token types are hardcoded to zero ("Sentence A")
|
5790
|
+
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
5791
|
+
inpL = ggml_add(ctx0, inpL, type_row0);
|
5792
|
+
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
5793
|
+
cb(inpL, "inp_embd", -1);
|
5794
|
+
|
5795
|
+
// embed layer norm
|
5796
|
+
inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
|
5797
|
+
cb(inpL, "inp_norm", -1);
|
5798
|
+
|
5799
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5800
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5801
|
+
cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
|
5802
|
+
|
5803
|
+
// iterate layers
|
5804
|
+
for (int il = 0; il < n_layer; ++il) {
|
5805
|
+
struct ggml_tensor * cur = inpL;
|
5806
|
+
|
5807
|
+
// self-attention
|
5808
|
+
{
|
5809
|
+
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
5810
|
+
cb(Qcur, "Qcur", il);
|
5811
|
+
|
5812
|
+
struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
5813
|
+
cb(Kcur, "Kcur", il);
|
5814
|
+
|
5815
|
+
struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
5816
|
+
cb(Vcur, "Vcur", il);
|
5817
|
+
|
5818
|
+
// seems like we just need to do this for Q?
|
5819
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
5820
|
+
|
5821
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5822
|
+
model.layers[il].wo, model.layers[il].bo,
|
5823
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5824
|
+
cb(cur, "kqv_out", il);
|
5825
|
+
}
|
5826
|
+
|
5827
|
+
// re-add the layer input
|
5828
|
+
cur = ggml_add(ctx0, cur, inpL);
|
5829
|
+
|
5830
|
+
// attention layer norm
|
5831
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
|
5832
|
+
|
5833
|
+
struct ggml_tensor * ffn_inp = cur;
|
5834
|
+
cb(ffn_inp, "ffn_inp", il);
|
5835
|
+
|
5836
|
+
// feed-forward network
|
5837
|
+
cur = llm_build_ffn(ctx0, cur,
|
5838
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
5839
|
+
NULL, NULL,
|
5840
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
5841
|
+
NULL,
|
5842
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
5843
|
+
cb(cur, "ffn_out", il);
|
5844
|
+
|
5845
|
+
// attentions bypass the intermediate layer
|
5846
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
5847
|
+
|
5848
|
+
// output layer norm
|
5849
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
|
5850
|
+
|
5851
|
+
// input for next layer
|
5852
|
+
inpL = cur;
|
5853
|
+
}
|
5854
|
+
|
5855
|
+
// final output
|
5856
|
+
cur = inpL;
|
5857
|
+
|
5858
|
+
// pooling layer
|
5859
|
+
if (do_pooling) {
|
5860
|
+
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
|
5861
|
+
}
|
5862
|
+
cb(cur, "result_embd", -1);
|
5863
|
+
|
5864
|
+
ggml_build_forward_expand(gf, cur);
|
5865
|
+
|
5866
|
+
return gf;
|
5867
|
+
}
|
5868
|
+
|
5628
5869
|
struct ggml_cgraph * build_bloom() {
|
5629
5870
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5630
5871
|
|
@@ -6996,12 +7237,10 @@ struct llm_build_context {
|
|
6996
7237
|
|
6997
7238
|
static struct ggml_cgraph * llama_build_graph(
|
6998
7239
|
llama_context & lctx,
|
6999
|
-
const llama_batch & batch
|
7240
|
+
const llama_batch & batch,
|
7241
|
+
bool worst_case) {
|
7000
7242
|
const auto & model = lctx.model;
|
7001
7243
|
|
7002
|
-
// check if we should build the worst-case graph (for memory measurement)
|
7003
|
-
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
|
7004
|
-
|
7005
7244
|
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
7006
7245
|
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
7007
7246
|
if (il >= 0) {
|
@@ -7022,67 +7261,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7022
7261
|
|
7023
7262
|
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
7024
7263
|
|
7025
|
-
//
|
7026
|
-
// set input data
|
7027
|
-
//
|
7028
|
-
|
7029
|
-
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
7030
|
-
if (batch.token) {
|
7031
|
-
const int64_t n_tokens = batch.n_tokens;
|
7032
|
-
|
7033
|
-
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
7034
|
-
}
|
7035
|
-
|
7036
|
-
if (batch.embd) {
|
7037
|
-
const int64_t n_embd = llm.n_embd;
|
7038
|
-
const int64_t n_tokens = batch.n_tokens;
|
7039
|
-
|
7040
|
-
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
7041
|
-
}
|
7042
|
-
|
7043
|
-
if (batch.pos) {
|
7044
|
-
const int64_t n_tokens = batch.n_tokens;
|
7045
|
-
|
7046
|
-
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
7047
|
-
}
|
7048
|
-
|
7049
|
-
{
|
7050
|
-
const int64_t n_kv = llm.n_kv;
|
7051
|
-
const int64_t n_tokens = batch.n_tokens;
|
7052
|
-
|
7053
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
7054
|
-
float * data = (float *) lctx.inp_KQ_mask->data;
|
7055
|
-
|
7056
|
-
for (int h = 0; h < 1; ++h) {
|
7057
|
-
for (int j = 0; j < n_tokens; ++j) {
|
7058
|
-
const llama_pos pos = batch.pos[j];
|
7059
|
-
const llama_seq_id seq_id = batch.seq_id[j][0];
|
7060
|
-
|
7061
|
-
for (int i = 0; i < n_kv; ++i) {
|
7062
|
-
float f;
|
7063
|
-
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
7064
|
-
f = -INFINITY;
|
7065
|
-
} else {
|
7066
|
-
f = 0;
|
7067
|
-
}
|
7068
|
-
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
7069
|
-
}
|
7070
|
-
}
|
7071
|
-
}
|
7072
|
-
}
|
7073
|
-
|
7074
|
-
if (llm.do_rope_shift) {
|
7075
|
-
const int64_t n_ctx = llm.n_ctx;
|
7076
|
-
|
7077
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7078
|
-
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7079
|
-
|
7080
|
-
for (int i = 0; i < n_ctx; ++i) {
|
7081
|
-
data[i] = lctx.kv_self.cells[i].delta;
|
7082
|
-
}
|
7083
|
-
}
|
7084
|
-
}
|
7085
|
-
|
7086
7264
|
llm.init();
|
7087
7265
|
|
7088
7266
|
switch (model.arch) {
|
@@ -7110,6 +7288,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7110
7288
|
{
|
7111
7289
|
result = llm.build_refact();
|
7112
7290
|
} break;
|
7291
|
+
case LLM_ARCH_BERT:
|
7292
|
+
{
|
7293
|
+
result = llm.build_bert();
|
7294
|
+
} break;
|
7113
7295
|
case LLM_ARCH_BLOOM:
|
7114
7296
|
{
|
7115
7297
|
result = llm.build_bloom();
|
@@ -7167,6 +7349,97 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7167
7349
|
return result;
|
7168
7350
|
}
|
7169
7351
|
|
7352
|
+
// Copies the per-batch input data into the context's input tensors:
//   - inp_tokens / inp_embd / inp_pos from the batch, when the matching pointer is set
//   - inp_KQ_mask, 0 for visible KV cells and -INFINITY for masked ones
//   - inp_K_shift from the KV cell deltas, when the KV cache has a pending shift
//   - inp_sum, the matrix consumed by the pooling layer
//
// The mask, shift and sum tensors are written through their `data` pointers,
// which is why their buffers are checked to be host buffers first.
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
    //
    // set input data
    //

    const auto & hparams = lctx.model.hparams;
    const auto & cparams = lctx.cparams;
    const auto & kv_self = lctx.kv_self;

    const int64_t n_tokens = batch.n_tokens;

    if (batch.token) {
        ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
    }

    if (batch.embd) {
        const int64_t n_embd = hparams.n_embd;

        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
    }

    if (batch.pos) {
        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
    }

    // attention mask
    {
        const int64_t n_kv = kv_self.n;

        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));

        float * data = (float *) lctx.inp_KQ_mask->data;

        // a single mask (h == 0) is written
        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                const llama_pos    pos    = batch.pos[j];
                const llama_seq_id seq_id = batch.seq_id[j][0];

                for (int i = 0; i < n_kv; ++i) {
                    // a cell is masked when it belongs to another sequence, or when
                    // attention is causal and the cell lies after this token
                    const bool masked =
                        !lctx.kv_self.cells[i].has_seq_id(seq_id) ||
                        (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos);

                    data[h*(n_kv*n_tokens) + j*n_kv + i] = masked ? -INFINITY : 0.0f;
                }
            }
        }
    }

    // NOTE(review): when pooling is enabled these values are overwritten by the
    // memset in the pooling block at the end of this function, and no other
    // reader of inp_sum is visible here - this fill looks vestigial; confirm
    // before removing.
    {
        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
        float * data = (float *) lctx.inp_sum->data;

        for (int i = 0; i < n_tokens; ++i) {
            data[i] = 1.0f/float(n_tokens);
        }
    }

    if (kv_self.has_shift) {
        const int64_t n_ctx = cparams.n_ctx;

        assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));

        int32_t * data = (int32_t *) lctx.inp_K_shift->data;

        for (int i = 0; i < n_ctx; ++i) {
            data[i] = lctx.kv_self.cells[i].delta;
        }
    }

    // pooling matrix: row seq_id has a 1 in every column whose token belongs to that sequence
    if (hparams.pooling_layer && cparams.do_pooling) {
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
        float * data = (float *) lctx.inp_sum->data;

        // size computed in 64 bits so that n_tokens*n_tokens cannot overflow int
        memset(lctx.inp_sum->data, 0, (size_t) (n_tokens*n_tokens) * ggml_element_size(lctx.inp_sum));

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];

            // rows are indexed by seq_id, so it must fall inside the n_tokens x n_tokens
            // region that was just cleared; otherwise the write below lands outside it
            GGML_ASSERT(seq_id >= 0 && seq_id < n_tokens && "seq_id out of range for the pooling matrix");

            data[seq_id*n_tokens + i] = 1.0f;
        }
    }
}
|
7442
|
+
|
7170
7443
|
// decode a batch of tokens by evaluating the transformer
|
7171
7444
|
//
|
7172
7445
|
// - lctx: llama context
|
@@ -7265,17 +7538,22 @@ static int llama_decode_internal(
|
|
7265
7538
|
ggml_backend_sched_reset(lctx.sched);
|
7266
7539
|
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
7267
7540
|
|
7268
|
-
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
7541
|
+
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
|
7269
7542
|
|
7270
7543
|
// the output is always the last tensor in the graph
|
7271
7544
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
7272
|
-
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
7273
|
-
|
7274
|
-
// the embeddings could be the second to last tensor, or the third to last tensor
|
7275
7545
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
7276
|
-
if (strcmp(
|
7277
|
-
embeddings
|
7278
|
-
|
7546
|
+
if (strcmp(res->name, "result_output") == 0) {
|
7547
|
+
// the embeddings could be the second to last tensor, or the third to last tensor
|
7548
|
+
if (strcmp(embeddings->name, "result_norm") != 0) {
|
7549
|
+
embeddings = gf->nodes[gf->n_nodes - 3];
|
7550
|
+
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
7551
|
+
}
|
7552
|
+
} else if (strcmp(res->name, "result_embd") == 0) {
|
7553
|
+
embeddings = res;
|
7554
|
+
res = nullptr;
|
7555
|
+
} else {
|
7556
|
+
GGML_ASSERT(false);
|
7279
7557
|
}
|
7280
7558
|
|
7281
7559
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
@@ -7285,7 +7563,9 @@ static int llama_decode_internal(
|
|
7285
7563
|
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
7286
7564
|
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
7287
7565
|
// with the BLAS calls. need a better solution
|
7288
|
-
|
7566
|
+
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
|
7567
|
+
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
|
7568
|
+
if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
7289
7569
|
n_threads = std::min(4, n_threads);
|
7290
7570
|
}
|
7291
7571
|
|
@@ -7303,6 +7583,9 @@ static int llama_decode_internal(
|
|
7303
7583
|
if (lctx.backend_cpu != nullptr) {
|
7304
7584
|
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
7305
7585
|
}
|
7586
|
+
|
7587
|
+
llama_set_inputs(lctx, batch);
|
7588
|
+
|
7306
7589
|
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
7307
7590
|
|
7308
7591
|
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
@@ -7342,7 +7625,7 @@ static int llama_decode_internal(
|
|
7342
7625
|
// extract logits
|
7343
7626
|
// TODO: do not compute and extract logits if only embeddings are needed
|
7344
7627
|
// need to update the graphs to skip "result_output"
|
7345
|
-
{
|
7628
|
+
if (res) {
|
7346
7629
|
auto & logits_out = lctx.logits;
|
7347
7630
|
|
7348
7631
|
#ifndef NDEBUG
|
@@ -7386,9 +7669,12 @@ static int llama_decode_internal(
|
|
7386
7669
|
if (!lctx.embedding.empty()) {
|
7387
7670
|
auto & embedding_out = lctx.embedding;
|
7388
7671
|
|
7389
|
-
|
7672
|
+
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
|
7673
|
+
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
|
7674
|
+
|
7675
|
+
embedding_out.resize(embd_size);
|
7390
7676
|
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
7391
|
-
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(),
|
7677
|
+
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
|
7392
7678
|
ggml_backend_synchronize(embeddings_backend);
|
7393
7679
|
}
|
7394
7680
|
|
@@ -7452,6 +7738,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
7452
7738
|
GGML_ASSERT(false);
|
7453
7739
|
return unicode_to_bytes_bpe(token_data.text);
|
7454
7740
|
}
|
7741
|
+
case LLAMA_VOCAB_TYPE_WPM: {
|
7742
|
+
GGML_ASSERT(false);
|
7743
|
+
}
|
7455
7744
|
default:
|
7456
7745
|
GGML_ASSERT(false);
|
7457
7746
|
}
|
@@ -7462,8 +7751,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
|
7462
7751
|
switch (llama_vocab_get_type(vocab)) {
|
7463
7752
|
case LLAMA_VOCAB_TYPE_SPM: {
|
7464
7753
|
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
7465
|
-
|
7754
|
+
auto token = vocab.token_to_id.find(buf);
|
7755
|
+
if (token != vocab.token_to_id.end()) {
|
7756
|
+
return (*token).second;
|
7757
|
+
}
|
7758
|
+
// Try to fall back to just the byte as a string
|
7759
|
+
const char buf2[2] = { (char)ch, 0 };
|
7760
|
+
return vocab.token_to_id.at(buf2);
|
7466
7761
|
}
|
7762
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
7467
7763
|
case LLAMA_VOCAB_TYPE_BPE: {
|
7468
7764
|
return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
|
7469
7765
|
}
|
@@ -7509,7 +7805,7 @@ struct llm_bigram_spm {
|
|
7509
7805
|
};
|
7510
7806
|
|
7511
7807
|
struct llm_tokenizer_spm {
|
7512
|
-
llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
|
7808
|
+
llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
|
7513
7809
|
|
7514
7810
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
7515
7811
|
// split string into utf8 chars
|
@@ -7584,6 +7880,7 @@ private:
|
|
7584
7880
|
|
7585
7881
|
if (p == rev_merge.end()) {
|
7586
7882
|
// output any symbols that did not form tokens as bytes.
|
7883
|
+
output.reserve(output.size() + symbol.n);
|
7587
7884
|
for (int j = 0; j < (int)symbol.n; ++j) {
|
7588
7885
|
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
|
7589
7886
|
output.push_back(token_id);
|
@@ -7934,29 +8231,230 @@ private:
|
|
7934
8231
|
llm_bigram_bpe::queue work_queue;
|
7935
8232
|
};
|
7936
8233
|
|
7937
|
-
|
8234
|
+
struct llm_tokenizer_wpm {
|
8235
|
+
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
8236
|
+
|
8237
|
+
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
8238
|
+
auto * token_map = &vocab.token_to_id;
|
8239
|
+
|
8240
|
+
// normalize and split by whitespace
|
8241
|
+
std::vector<std::string> words = preprocess(text);
|
8242
|
+
|
8243
|
+
// bos token prepended already
|
8244
|
+
|
8245
|
+
// find the longest tokens that form the words
|
8246
|
+
for (const std::string &word : words) {
|
8247
|
+
// skip empty words
|
8248
|
+
if (word.size() == 0) {
|
8249
|
+
continue;
|
8250
|
+
}
|
8251
|
+
|
8252
|
+
// prepend phantom space
|
8253
|
+
std::string word1 = "\xe2\x96\x81" + word;
|
8254
|
+
int n = word1.size();
|
8255
|
+
|
8256
|
+
// we're at the start of a new word
|
8257
|
+
int i = 0;
|
8258
|
+
bool match_any = false;
|
8259
|
+
|
8260
|
+
// move through character position in word
|
8261
|
+
while (i < n) {
|
8262
|
+
// loop through possible match length
|
8263
|
+
bool match = false;
|
8264
|
+
for (int j = n; j > i; j--) {
|
8265
|
+
auto it = token_map->find(word1.substr(i, j - i));
|
8266
|
+
if (it != token_map->end()) {
|
8267
|
+
output.push_back(it->second);
|
8268
|
+
match = true;
|
8269
|
+
match_any = true;
|
8270
|
+
i = j;
|
8271
|
+
break;
|
8272
|
+
}
|
8273
|
+
}
|
8274
|
+
|
8275
|
+
// must be an unknown character
|
8276
|
+
if (!match) {
|
8277
|
+
i++;
|
8278
|
+
}
|
8279
|
+
}
|
8280
|
+
|
8281
|
+
// we didn't find any matches for this word
|
8282
|
+
if (!match_any) {
|
8283
|
+
output.push_back(vocab.special_unk_id);
|
8284
|
+
}
|
8285
|
+
}
|
8286
|
+
|
8287
|
+
// append eos token
|
8288
|
+
output.push_back(vocab.special_eos_id);
|
8289
|
+
}
|
8290
|
+
|
8291
|
+
std::vector<std::string> preprocess(const std::string & text) {
|
8292
|
+
std::string ori_str = normalize(text);
|
8293
|
+
uint64_t ori_size = ori_str.size();
|
8294
|
+
|
8295
|
+
// single punct / single symbol / single digit
|
8296
|
+
// baseline: add whitespace on the left and right of punct and chinese characters
|
8297
|
+
std::vector<std::string> words;
|
8298
|
+
std::string new_str = "";
|
8299
|
+
uint64_t i = 0;
|
8300
|
+
while (i < ori_size) {
|
8301
|
+
int utf_char_len = utf8_len(ori_str[i]);
|
8302
|
+
if ((utf_char_len == 1) && ispunct(ori_str[i])) {
|
8303
|
+
new_str += " ";
|
8304
|
+
new_str += ori_str[i];
|
8305
|
+
new_str += " ";
|
8306
|
+
i += 1;
|
8307
|
+
}
|
8308
|
+
else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
|
8309
|
+
new_str += " ";
|
8310
|
+
new_str += ori_str.substr(i, 3);
|
8311
|
+
new_str += " ";
|
8312
|
+
i += 3;
|
8313
|
+
}
|
8314
|
+
else {
|
8315
|
+
new_str += ori_str[i];
|
8316
|
+
i += 1;
|
8317
|
+
}
|
8318
|
+
}
|
8319
|
+
|
8320
|
+
// split by whitespace
|
8321
|
+
uint64_t l = 0;
|
8322
|
+
uint64_t r = 0;
|
8323
|
+
while (r < new_str.size()) {
|
8324
|
+
// if is whitespace
|
8325
|
+
if (isspace(new_str[r])) {
|
8326
|
+
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
8327
|
+
l = r + 1;
|
8328
|
+
r = l;
|
8329
|
+
}
|
8330
|
+
else {
|
8331
|
+
r += 1;
|
8332
|
+
}
|
8333
|
+
}
|
8334
|
+
if (r > l) {
|
8335
|
+
words.push_back(new_str.substr(l, (r - l)));
|
8336
|
+
}
|
8337
|
+
return words;
|
8338
|
+
}
|
8339
|
+
|
8340
|
+
std::string normalize(const std::string & text) {
|
8341
|
+
// TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
|
8342
|
+
std::string text2 = strip_accents(text);
|
8343
|
+
for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
|
8344
|
+
char c = text2[i];
|
8345
|
+
if (c >= 'A' && c <= 'Z') {
|
8346
|
+
text2[i] = c - 'A' + 'a';
|
8347
|
+
}
|
8348
|
+
}
|
8349
|
+
return text2;
|
8350
|
+
}
|
8351
|
+
|
8352
|
+
bool is_chinese_char(const std::string & str) {
|
8353
|
+
int len = str.length();
|
8354
|
+
unsigned int codepoint = 0;
|
8355
|
+
int num_bytes = 0;
|
8356
|
+
int i = 0;
|
8357
|
+
unsigned char ch = static_cast<unsigned char>(str[i]);
|
8358
|
+
if (ch <= 0x7f) {
|
8359
|
+
codepoint = ch;
|
8360
|
+
num_bytes = 1;
|
8361
|
+
} else if ((ch >> 5) == 0x06) {
|
8362
|
+
codepoint = ch & 0x1f;
|
8363
|
+
num_bytes = 2;
|
8364
|
+
} else if ((ch >> 4) == 0x0e) {
|
8365
|
+
codepoint = ch & 0x0f;
|
8366
|
+
num_bytes = 3;
|
8367
|
+
} else if ((ch >> 3) == 0x1e) {
|
8368
|
+
codepoint = ch & 0x07;
|
8369
|
+
num_bytes = 4;
|
8370
|
+
}
|
8371
|
+
for (int j = 1; j < num_bytes; ++j) {
|
8372
|
+
if (i + j >= len) {
|
8373
|
+
return false; // incomplete UTF-8 character
|
8374
|
+
}
|
8375
|
+
unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
|
8376
|
+
if ((next_ch >> 6) != 0x02) {
|
8377
|
+
return false; // invalid trailing byte
|
8378
|
+
}
|
8379
|
+
codepoint = (codepoint << 6) | (next_ch & 0x3f);
|
8380
|
+
}
|
8381
|
+
if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
|
8382
|
+
(codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
|
8383
|
+
(codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
|
8384
|
+
(codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
|
8385
|
+
(codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
|
8386
|
+
(codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
|
8387
|
+
(codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
|
8388
|
+
(codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
|
8389
|
+
(codepoint >= 0x3000 && codepoint <= 0x303F) ||
|
8390
|
+
(codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
|
8391
|
+
return true; // NOLINT
|
8392
|
+
}
|
8393
|
+
return false;
|
8394
|
+
}
|
8395
|
+
|
8396
|
+
std::string strip_accents(const std::string & input_string) {
|
8397
|
+
std::string resultString;
|
8398
|
+
std::map<std::string, char> accent_map = {
|
8399
|
+
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
|
8400
|
+
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
|
8401
|
+
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
|
8402
|
+
{"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
|
8403
|
+
{"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
|
8404
|
+
{"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
|
8405
|
+
{"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
|
8406
|
+
{"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
|
8407
|
+
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
|
8408
|
+
};
|
8409
|
+
|
8410
|
+
for (size_t i = 0; i < input_string.length();) {
|
8411
|
+
int len = utf8_len(input_string[i]);
|
8412
|
+
std::string curChar = input_string.substr(i, len);
|
8413
|
+
auto iter = accent_map.find(curChar);
|
8414
|
+
if (iter != accent_map.end()) {
|
8415
|
+
resultString += iter->second;
|
8416
|
+
} else {
|
8417
|
+
resultString += curChar;
|
8418
|
+
}
|
8419
|
+
i += len;
|
8420
|
+
}
|
8421
|
+
|
8422
|
+
return resultString;
|
8423
|
+
}
|
8424
|
+
|
8425
|
+
static size_t utf8_len(char src) {
|
8426
|
+
const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
|
8427
|
+
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
8428
|
+
return lookup[highbits];
|
8429
|
+
}
|
8430
|
+
|
8431
|
+
const llama_vocab & vocab;
|
8432
|
+
};
|
8433
|
+
|
8434
|
+
// Discriminator for fragment_buffer_variant: tells whether a fragment already
// carries a resolved token id or still refers to a slice of raw text.
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN    = 0, // fragment holds a token id
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT = 1  // fragment holds (raw_text, offset, length)
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
7941
8438
|
|
7942
|
-
struct fragment_buffer_variant{
|
8439
|
+
struct fragment_buffer_variant {
|
7943
8440
|
fragment_buffer_variant(llama_vocab::id _token)
|
7944
8441
|
:
|
7945
8442
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
|
7946
8443
|
token(_token),
|
7947
8444
|
raw_text(_dummy),
|
7948
8445
|
offset(0),
|
7949
|
-
length(0){}
|
8446
|
+
length(0) {}
|
8447
|
+
|
7950
8448
|
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
7951
8449
|
:
|
7952
8450
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
7953
|
-
token((llama_vocab::id)-1),
|
8451
|
+
token((llama_vocab::id) - 1),
|
7954
8452
|
raw_text(_raw_text),
|
7955
8453
|
offset(_offset),
|
7956
8454
|
length(_length){
|
7957
|
-
GGML_ASSERT(
|
7958
|
-
GGML_ASSERT(
|
7959
|
-
GGML_ASSERT(
|
8455
|
+
GGML_ASSERT(_offset >= 0);
|
8456
|
+
GGML_ASSERT(_length >= 1);
|
8457
|
+
GGML_ASSERT(offset + length <= raw_text.length());
|
7960
8458
|
}
|
7961
8459
|
|
7962
8460
|
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
@@ -7969,8 +8467,7 @@ struct fragment_buffer_variant{
|
|
7969
8467
|
|
7970
8468
|
// #define PRETOKENIZERDEBUG
|
7971
8469
|
|
7972
|
-
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
|
7973
|
-
{
|
8470
|
+
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
7974
8471
|
// for each special token
|
7975
8472
|
for (const auto & st: vocab.special_tokens_cache) {
|
7976
8473
|
const auto & special_token = st.first;
|
@@ -8081,17 +8578,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
8081
8578
|
}
|
8082
8579
|
|
8083
8580
|
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
8084
|
-
fragment_buffer.emplace_front(
|
8581
|
+
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
8085
8582
|
|
8086
|
-
if (special) tokenizer_st_partition(
|
8583
|
+
if (special) tokenizer_st_partition(vocab, fragment_buffer);
|
8087
8584
|
|
8088
8585
|
switch (vocab.type) {
|
8089
8586
|
case LLAMA_VOCAB_TYPE_SPM:
|
8090
8587
|
{
|
8091
|
-
for (const auto & fragment: fragment_buffer)
|
8092
|
-
|
8093
|
-
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
8094
|
-
{
|
8588
|
+
for (const auto & fragment : fragment_buffer) {
|
8589
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
8095
8590
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
8096
8591
|
|
8097
8592
|
// TODO: It's likely possible to get rid of this string copy entirely
|
@@ -8111,19 +8606,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
8111
8606
|
llm_tokenizer_spm tokenizer(vocab);
|
8112
8607
|
llama_escape_whitespace(raw_text);
|
8113
8608
|
tokenizer.tokenize(raw_text, output);
|
8114
|
-
}
|
8115
|
-
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8116
|
-
{
|
8609
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8117
8610
|
output.push_back(fragment.token);
|
8118
8611
|
}
|
8119
8612
|
}
|
8120
8613
|
} break;
|
8121
8614
|
case LLAMA_VOCAB_TYPE_BPE:
|
8122
8615
|
{
|
8123
|
-
for (const auto & fragment: fragment_buffer)
|
8124
|
-
|
8125
|
-
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
8126
|
-
{
|
8616
|
+
for (const auto & fragment : fragment_buffer) {
|
8617
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
8127
8618
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
8128
8619
|
|
8129
8620
|
#ifdef PRETOKENIZERDEBUG
|
@@ -8131,9 +8622,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
8131
8622
|
#endif
|
8132
8623
|
llm_tokenizer_bpe tokenizer(vocab);
|
8133
8624
|
tokenizer.tokenize(raw_text, output);
|
8625
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8626
|
+
output.push_back(fragment.token);
|
8134
8627
|
}
|
8135
|
-
|
8136
|
-
|
8628
|
+
}
|
8629
|
+
} break;
|
8630
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
8631
|
+
{
|
8632
|
+
for (const auto & fragment : fragment_buffer) {
|
8633
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
8634
|
+
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
8635
|
+
|
8636
|
+
#ifdef PRETOKENIZERDEBUG
|
8637
|
+
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
8638
|
+
#endif
|
8639
|
+
llm_tokenizer_wpm tokenizer(vocab);
|
8640
|
+
tokenizer.tokenize(raw_text, output);
|
8641
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8137
8642
|
output.push_back(fragment.token);
|
8138
8643
|
}
|
8139
8644
|
}
|
@@ -9785,6 +10290,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9785
10290
|
}
|
9786
10291
|
++qs.i_ffn_up;
|
9787
10292
|
}
|
10293
|
+
|
9788
10294
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
9789
10295
|
//}
|
9790
10296
|
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
@@ -9844,19 +10350,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9844
10350
|
|
9845
10351
|
// K-quants
|
9846
10352
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
9847
|
-
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
10353
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
9848
10354
|
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
9849
10355
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
9850
10356
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
9851
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
10357
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
9852
10358
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
9853
|
-
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
10359
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
|
9854
10360
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
9855
|
-
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
9856
|
-
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
9857
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
9858
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XS
|
9859
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
10361
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
10362
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
10363
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
10364
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
10365
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
9860
10366
|
|
9861
10367
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
9862
10368
|
}
|
@@ -9986,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9986
10492
|
quantize &= !params->only_copy;
|
9987
10493
|
|
9988
10494
|
// do not quantize expert gating tensors
|
9989
|
-
quantize &= name.
|
10495
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
|
10496
|
+
|
10497
|
+
// do not quantize positional embeddings and token types (BERT)
|
10498
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
10499
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
9990
10500
|
|
9991
10501
|
enum ggml_type new_type;
|
9992
10502
|
void * new_data;
|
@@ -10488,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
|
|
10488
10998
|
/*.logits_all =*/ false,
|
10489
10999
|
/*.embedding =*/ false,
|
10490
11000
|
/*.offload_kqv =*/ true,
|
11001
|
+
/*.do_pooling =*/ true,
|
10491
11002
|
};
|
10492
11003
|
|
10493
11004
|
return result;
|
@@ -10643,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
|
|
10643
11154
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
10644
11155
|
cparams.mul_mat_q = params.mul_mat_q;
|
10645
11156
|
cparams.offload_kqv = params.offload_kqv;
|
11157
|
+
cparams.do_pooling = params.do_pooling;
|
10646
11158
|
|
10647
11159
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
10648
11160
|
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
@@ -10790,14 +11302,14 @@ struct llama_context * llama_new_context_with_model(
|
|
10790
11302
|
// resized during inference, reserve maximum
|
10791
11303
|
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
10792
11304
|
|
10793
|
-
if (params.embedding){
|
11305
|
+
if (params.embedding) {
|
10794
11306
|
ctx->embedding.resize(hparams.n_embd);
|
10795
11307
|
}
|
10796
11308
|
|
10797
11309
|
// graph inputs
|
10798
11310
|
{
|
10799
11311
|
ggml_init_params init_params = {
|
10800
|
-
/* .mem_size */ ggml_tensor_overhead()*
|
11312
|
+
/* .mem_size */ ggml_tensor_overhead()*7,
|
10801
11313
|
/* .mem_buffer */ nullptr,
|
10802
11314
|
/* .no_alloc */ true,
|
10803
11315
|
};
|
@@ -10808,12 +11320,14 @@ struct llama_context * llama_new_context_with_model(
|
|
10808
11320
|
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
10809
11321
|
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
10810
11322
|
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
11323
|
+
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
10811
11324
|
|
10812
11325
|
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
10813
11326
|
ggml_set_name(ctx->inp_embd, "inp_embd");
|
10814
11327
|
ggml_set_name(ctx->inp_pos, "inp_pos");
|
10815
11328
|
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
10816
11329
|
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
11330
|
+
ggml_set_name(ctx->inp_sum, "inp_sum");
|
10817
11331
|
|
10818
11332
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
10819
11333
|
|
@@ -10839,23 +11353,27 @@ struct llama_context * llama_new_context_with_model(
|
|
10839
11353
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
10840
11354
|
|
10841
11355
|
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
10842
|
-
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
10843
11356
|
|
10844
11357
|
// build worst-case graph
|
10845
11358
|
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
10846
11359
|
int n_past = cparams.n_ctx - n_tokens;
|
10847
11360
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
10848
|
-
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
11361
|
+
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
|
10849
11362
|
|
10850
11363
|
// initialize scheduler with the worst-case graph
|
10851
|
-
|
10852
|
-
|
11364
|
+
if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
|
11365
|
+
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
11366
|
+
llama_free(ctx);
|
11367
|
+
return nullptr;
|
11368
|
+
}
|
10853
11369
|
|
10854
|
-
for (
|
10855
|
-
|
11370
|
+
for (size_t i = 0; i < ctx->backends.size(); i++) {
|
11371
|
+
ggml_backend_t backend = ctx->backends[i];
|
11372
|
+
ggml_backend_buffer_type_t buft = backend_buft[i];
|
11373
|
+
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
|
10856
11374
|
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
10857
|
-
|
10858
|
-
|
11375
|
+
ggml_backend_buft_name(buft),
|
11376
|
+
size / 1024.0 / 1024.0);
|
10859
11377
|
}
|
10860
11378
|
|
10861
11379
|
// note: the number of splits during measure is higher than during inference due to the kv shift
|
@@ -11660,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
11660
12178
|
return ctx->embedding.data();
|
11661
12179
|
}
|
11662
12180
|
|
12181
|
+
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
12182
|
+
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
|
12183
|
+
}
|
12184
|
+
|
11663
12185
|
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
11664
12186
|
return model->vocab.id_to_token[token].text.c_str();
|
11665
12187
|
}
|
@@ -11744,6 +12266,7 @@ static std::string llama_decode_text(const std::string & text) {
|
|
11744
12266
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
|
11745
12267
|
if (0 <= token && token < llama_n_vocab(model)) {
|
11746
12268
|
switch (llama_vocab_get_type(model->vocab)) {
|
12269
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
11747
12270
|
case LLAMA_VOCAB_TYPE_SPM: {
|
11748
12271
|
// NOTE: we accept all unsupported token types,
|
11749
12272
|
// suppressing them like CONTROL tokens.
|
@@ -11867,6 +12390,7 @@ const char * llama_print_system_info(void) {
|
|
11867
12390
|
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
11868
12391
|
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
11869
12392
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
12393
|
+
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
11870
12394
|
|
11871
12395
|
return s.c_str();
|
11872
12396
|
}
|