llama_cpp 0.14.4 → 0.14.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +11 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +7 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +155 -155
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +878 -216
- data/vendor/tmp/llama.cpp/ggml.c +8 -8
- data/vendor/tmp/llama.cpp/ggml.h +7 -7
- data/vendor/tmp/llama.cpp/llama.cpp +686 -124
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- metadata +2 -2
@@ -261,6 +261,7 @@ enum llm_kv {
  LLM_KV_GENERAL_ALIGNMENT,
  LLM_KV_GENERAL_NAME,
  LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_VERSION,
  LLM_KV_GENERAL_URL,
  LLM_KV_GENERAL_DESCRIPTION,
  LLM_KV_GENERAL_LICENSE,
@@ -317,6 +318,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_CLS_ID,
+ LLM_KV_TOKENIZER_MASK_ID,
  LLM_KV_TOKENIZER_ADD_BOS,
  LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_ADD_PREFIX,
@@ -330,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
  { LLM_KV_GENERAL_NAME, "general.name" },
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_VERSION, "general.version" },
  { LLM_KV_GENERAL_URL, "general.url" },
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
  { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -386,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
@@ -924,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
  {
@@ -1630,17 +1638,17 @@ static size_t llama_get_device_memory(int device) {
  #if defined(GGML_USE_CUDA)
  size_t total;
  size_t free;
- ggml_backend_cuda_get_device_memory(device, &
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
  return free;
  #elif defined(GGML_USE_SYCL)
  size_t total;
  size_t free;
- ggml_backend_sycl_get_device_memory(device, &
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
  return free;
  #elif defined(GGML_USE_VULKAN)
  size_t total;
  size_t free;
- ggml_backend_vk_get_device_memory(device, &
+ ggml_backend_vk_get_device_memory(device, &free, &total);
  return free;
  #else
  return 1;
@@ -1697,6 +1705,8 @@ enum e_model {
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_8x7B,
+ MODEL_8x22B,
  };

  static const size_t kiB = 1024;
@@ -2014,11 +2024,13 @@ struct llama_vocab {
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;

  // default LLaMA special tokens
- id special_bos_id
- id special_eos_id
- id special_unk_id
- id special_sep_id
- id special_pad_id
+ id special_bos_id = 1;
+ id special_eos_id = 2;
+ id special_unk_id = 0;
+ id special_sep_id = -1;
+ id special_pad_id = -1;
+ id special_cls_id = -1;
+ id special_mask_id = -1;

  int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
@@ -2175,7 +2187,7 @@ struct llama_context {

  std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
  size_t output_size = 0; // capacity (of tokens positions) for the output buffers
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

  bool logits_all = false;

@@ -3548,6 +3560,8 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
  default: return "?B";
  }
  }
@@ -3662,15 +3676,23 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-
-
-
-
-
-
-
-
-
+ if (hparams.n_expert == 8) {
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_8x7B; break;
+ case 56: model.type = e_model::MODEL_8x22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 26: model.type = e_model::MODEL_3B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_34B; break;
+ case 60: model.type = e_model::MODEL_30B; break;
+ case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
  }
  } break;
  case LLM_ARCH_MINICPM:
@@ -3974,7 +3996,9 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(
+ static std::vector<llama_vocab::id> llama_tokenize_internal(
+ const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+ );
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
@@ -3996,23 +4020,27 @@ static void llm_load_vocab(
  vocab.type = LLAMA_VOCAB_TYPE_NONE;

  // default special tokens
- vocab.special_bos_id
- vocab.special_eos_id
- vocab.special_unk_id
- vocab.special_sep_id
- vocab.special_pad_id
- vocab.
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;
+ vocab.linefeed_id = -1;

  return;
  } else if (tokenizer_name == "llama") {
  vocab.type = LLAMA_VOCAB_TYPE_SPM;

  // default special tokens
- vocab.special_bos_id
- vocab.special_eos_id
- vocab.special_unk_id
- vocab.special_sep_id
- vocab.special_pad_id
+ vocab.special_bos_id = 1;
+ vocab.special_eos_id = 2;
+ vocab.special_unk_id = 0;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;

  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
@@ -4047,20 +4075,24 @@ static void llm_load_vocab(
  }

  // default special tokens
- vocab.special_bos_id
- vocab.special_eos_id
- vocab.special_unk_id
- vocab.special_sep_id
- vocab.special_pad_id
+ vocab.special_bos_id = 11;
+ vocab.special_eos_id = 11;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;
  } else if (tokenizer_name == "bert") {
  vocab.type = LLAMA_VOCAB_TYPE_WPM;

  // default special tokens
- vocab.special_bos_id
- vocab.special_eos_id
- vocab.special_unk_id
- vocab.special_sep_id
- vocab.special_pad_id
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = 100;
+ vocab.special_sep_id = 102;
+ vocab.special_pad_id = 0;
+ vocab.special_cls_id = 101;
+ vocab.special_mask_id = 103;
  vocab.add_space_prefix = false;
  } else {
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4123,11 +4155,13 @@ static void llm_load_vocab(
  // special tokens
  {
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID,
- { LLM_KV_TOKENIZER_EOS_ID,
- { LLM_KV_TOKENIZER_UNK_ID,
- { LLM_KV_TOKENIZER_SEP_ID,
- { LLM_KV_TOKENIZER_PAD_ID,
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
  };
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
@@ -4319,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id
- if (vocab.special_eos_id
- if (vocab.special_unk_id
- if (vocab.special_sep_id
- if (vocab.special_pad_id
- if (vocab.
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

  // Returns false if cancelled by progress_callback
@@ -5404,6 +5440,11 @@ static bool llm_load_tensors(

  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

+ if (n_layer >= 64){
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+ }
+
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -9452,6 +9493,31 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

+ if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+ ggml_element_size(Qcur) * n_embd_head,
+ ggml_element_size(Qcur) * n_embd_head * n_head,
+ 0);
+ cb(Qcur, "Qcur", il);
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+ ggml_element_size(Kcur) * n_embd_head,
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+ 0);
+ cb(Kcur, "Kcur", il);
+
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
+ model.layers[il].attn_q_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Kcur, "Kcur", il);
+ }
+
  Qcur = ggml_rope_custom(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -10409,6 +10475,9 @@ static int llama_decode_internal(
  n_outputs_prev += lctx.n_outputs;
  }

+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
+ lctx.n_outputs = n_outputs;
+
  // wait for the computation to finish (automatically done when obtaining the model output)
  //llama_synchronize(&lctx);

@@ -11052,7 +11121,7 @@ struct llm_tokenizer_bpe {
  add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
  }

- // add the
+ // add the finished tokens to the final list keeping correct order for next and prev
  for (auto & sym : symbols) {
  if (sym.n > 0) {
  sym.prev = final_prev_index;
@@ -11321,9 +11390,6 @@ struct llm_tokenizer_wpm {
  output.push_back(vocab.special_unk_id);
  }
  }
-
- // append eos token
- output.push_back(vocab.special_eos_id);
  }

  std::vector<std::string> preprocess(const std::string & text) {
@@ -11528,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
  }
  }

- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
  std::vector<llama_vocab::id> output;
-
- // OG tokenizer behavior:
- //
- // tokenizer.encode('', add_bos=True) returns [1]
- // tokenizer.encode('', add_bos=False) returns []
-
- if (bos && vocab.special_bos_id != -1) {
- output.push_back(vocab.special_bos_id);
- }
-
- if (raw_text.empty()) {
- return output;
- }
-
  std::forward_list<fragment_buffer_variant> fragment_buffer;
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());

- if (
+ if (!raw_text.empty()) {
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+ if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+ }

  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
+ // tokenizer.encode('', add_special_tokens=False) returns []
+
+ if (add_special && vocab.special_add_bos != 0) {
+ GGML_ASSERT(vocab.special_bos_id != -1);
+ output.push_back(vocab.special_bos_id);
+ }
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -11577,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(fragment.token);
  }
  }
+
+ if (add_special && vocab.special_add_eos == 1) {
+ GGML_ASSERT(vocab.special_eos_id != -1);
+ output.push_back(vocab.special_eos_id);
+ }
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
+ if (add_special && vocab.special_add_bos == 1) {
+ GGML_ASSERT(vocab.special_bos_id != -1);
+ output.push_back(vocab.special_bos_id);
+ }
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11593,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(fragment.token);
  }
  }
+
+ GGML_ASSERT(vocab.special_add_eos != 1);
  } break;
  case LLAMA_VOCAB_TYPE_WPM:
  {
+ if (add_special) {
+ GGML_ASSERT(vocab.special_cls_id != -1);
+ output.push_back(vocab.special_cls_id);
+ }
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11609,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(fragment.token);
  }
  }
+
+ if (add_special) {
+ GGML_ASSERT(vocab.special_sep_id != -1);
+ output.push_back(vocab.special_sep_id);
+ }
  } break;
  case LLAMA_VOCAB_TYPE_NONE:
  GGML_ASSERT(false);
@@ -11775,7 +11861,9 @@ static void llama_grammar_advance_stack(
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

  if (stack.empty()) {
- new_stacks.
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ new_stacks.emplace_back(stack);
+ }
  return;
  }

@@ -11812,7 +11900,10 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
- new_stacks.
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ // only add the stack if it's not a duplicate of one we already have
+ new_stacks.emplace_back(stack);
+ }
  break;
  default:
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11826,12 +11917,13 @@ static void llama_grammar_advance_stack(
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
  // produces the N possible stacks if the given char is accepted at those
  // positions
-
+ void llama_grammar_accept(
  const std::vector<std::vector<llama_grammar_element>> & rules,
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
- const uint32_t chr
+ const uint32_t chr,
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

-
+ new_stacks.clear();

  for (const auto & stack : stacks) {
  if (stack.empty()) {
@@ -11850,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
  llama_grammar_advance_stack(rules, new_stack, new_stacks);
  }
  }
-
- return new_stacks;
  }

  static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11865,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
  const std::vector<llama_grammar_candidate> & candidates) {

  std::vector<llama_grammar_candidate> rejects;
+ rejects.reserve(candidates.size());

  if (stack.empty()) {
  for (const auto & tok : candidates) {
@@ -11878,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
  const llama_grammar_element * stack_pos = stack.back();

  std::vector<llama_grammar_candidate> next_candidates;
+ next_candidates.reserve(candidates.size());
+
  for (const auto & tok : candidates) {
  if (*tok.code_points == 0) {
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12685,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
  const auto & code_points = decoded.first;
+ std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-
+ llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+ grammar->stacks = tmp_new_stacks;
  }
  grammar->partial_utf8 = decoded.second;
  GGML_ASSERT(!grammar->stacks.empty());
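As the hunk above shows, llama_grammar_accept now fills a caller-supplied vector of stacks instead of returning a new one. A minimal sketch of the new call pattern in C++ (the loop and the container of code points are illustrative, not part of this diff):

    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
    for (uint32_t cp : decoded_code_points) {  // illustrative: code points of the accepted piece
        // next_stacks is cleared inside the call, then filled with the advanced stacks
        llama_grammar_accept(grammar->rules, grammar->stacks, cp, next_stacks);
        grammar->stacks = next_stacks;
    }
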
@@ -13318,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
  return new_type;
  }

- static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
  std::mutex mutex;
-
+ int64_t counter = 0;
  size_t new_size = 0;
  if (nthread < 2) {
  // single-thread
@@ -13328,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
  }
  auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
  nrows, n_per_row, imatrix]() {
- const
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
  size_t local_size = 0;
  while (true) {
  std::unique_lock<std::mutex> lock(mutex);
-
+ int64_t first_row = counter; counter += nrows_per_chunk;
  if (first_row >= nrows) {
  if (local_size > 0) {
  new_size += local_size;
@@ -13340,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
  break;
  }
  lock.unlock();
- const
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
  }
  };
@@ -13463,7 +13558,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  const std::string name = ggml_get_name(meta);

  // TODO: avoid hardcoded tensor names - use the TN_* constants
- if (name.find("attn_v.weight")
+ if (name.find("attn_v.weight") != std::string::npos ||
+ name.find("attn_qkv.weight") != std::string::npos) {
  ++qs.n_attention_wv;
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
  qs.has_output = true;
@@ -13473,7 +13569,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

  // sanity checks
-
+ //
+ // - qs.n_attention_wv == 0 for Mamba models
+ // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+ //
+ GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");

  size_t total_size_org = 0;
  size_t total_size_new = 0;
@@ -13529,6 +13629,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  // quantize only 2D and 3D tensors (experts)
  quantize &= (ggml_n_dims(tensor) >= 2);
+
+ // do not quantize norm tensors
+ quantize &= name.find("_norm.weight") == std::string::npos;
+
  quantize &= params->quantize_output_tensor || name != "output.weight";
  quantize &= !params->only_copy;

@@ -13557,10 +13661,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (!params->pure && ggml_is_quantized(default_type)) {
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
  }
-
+ if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
  new_type = params->token_embedding_type;
  }
-
+ if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
  new_type = params->output_tensor_type;
  }

@@ -13575,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  new_size = ggml_nbytes(tensor);
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
  } else {
- const
+ const int64_t nelements = ggml_nelements(tensor);

  const float * imatrix = nullptr;
  if (imatrix_data) {
@@ -13627,20 +13731,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
  fflush(stdout);

- if (work.size() < nelements * 4) {
+ if (work.size() < (size_t)nelements * 4) {
  work.resize(nelements * 4); // upper bound on size
  }
  new_data = work.data();

- const
- const
+ const int64_t n_per_row = tensor->ne[0];
+ const int64_t nrows = tensor->ne[1];

- static const
- const
+ static const int64_t min_chunk_size = 32 * 512;
+ const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);

- const
- const
- const
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

  // quantize each expert separately since they have different importance matrices
  new_size = 0;
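To make the chunking above concrete, here is a worked example with illustrative numbers (not taken from any specific model): for a tensor with rows of 4096 elements, min_chunk_size is 32 * 512 = 16384, so chunk_size becomes 4096 * ((16384 + 4095) / 4096) = 4096 * 4 = 16384 elements, i.e. four rows per chunk. A 4096 x 4096 tensor then has nelements_matrix = 16777216, nchunk = (16777216 + 16383) / 16384 = 1024 chunks, and nthread_use is capped at min(nthread, 1024). The switch to int64_t keeps these products from overflowing on very large tensors.
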
@@ -14905,9 +15009,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
  llama_kv_cache_update_internal(*ctx);
  }

+ // deprecated
+ size_t llama_get_state_size(const struct llama_context * ctx) {
+ return llama_state_get_size(ctx);
+ }
+
+ // deprecated
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+ return llama_state_get_data(ctx, dst);
+ }
+
+ // deprecated
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ return llama_state_set_data(ctx, src);
+ }
+
+ // deprecated
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ }
+
+ // deprecated
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+ }

  // Returns the *maximum* size of the state
- size_t
+ size_t llama_state_get_size(const struct llama_context * ctx) {
  const auto & cparams = ctx->cparams;
  const auto & hparams = ctx->model.hparams;

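The old llama_get_state_size / llama_copy_state_data / llama_set_state_data and session-file entry points are kept as thin deprecated wrappers around the renamed llama_state_* functions. A minimal sketch of whole-context save and restore with the new names (the buffer handling is illustrative and assumes ctx is an initialized llama_context):

    // llama_state_get_size reports the maximum state size; serialize into a buffer of that size
    std::vector<uint8_t> state_buf(llama_state_get_size(ctx));
    const size_t n_written = llama_state_get_data(ctx, state_buf.data());

    // later, restore into a context created with the same model and parameters
    const size_t n_read = llama_state_set_data(ctx, state_buf.data());
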
@@ -14995,15 +15123,15 @@ struct llama_data_file_context : llama_data_context {
  * file context:
  * llama_file file("/path", "wb");
  * llama_data_file_context data_ctx(&file);
- *
+ * llama_state_get_data(ctx, &data_ctx);
  *
  * buffer context:
  * std::vector<uint8_t> buf(max_size, 0);
  * llama_data_buffer_context data_ctx(&buf.data());
- *
+ * llama_state_get_data(ctx, &data_ctx);
  *
  */
- static void
+ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
  // copy rng
  {
  std::ostringstream rng_ss;
@@ -15147,15 +15275,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  }
  }

- size_t
+ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
  llama_data_buffer_context data_ctx(dst);
-
+ llama_state_get_data_internal(ctx, &data_ctx);

  return data_ctx.get_size_written();
  }

  // Sets the state reading from the specified source address
- size_t
+ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
  const uint8_t * inp = src;

  // set rng
@@ -15307,14 +15435,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  }

  const size_t nread = inp - src;
- const size_t max_size =
+ const size_t max_size = llama_state_get_size(ctx);

  GGML_ASSERT(nread <= max_size);

  return nread;
  }

- static bool
+ static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  llama_file file(path_session, "rb");

  // sanity checks
@@ -15352,7 +15480,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  // restore the context state
  {
  const size_t n_state_size_cur = file.size - file.tell();
- const size_t n_state_size_max =
+ const size_t n_state_size_max = llama_state_get_size(ctx);

  if (n_state_size_cur > n_state_size_max) {
  LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -15362,22 +15490,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
  std::vector<uint8_t> state_data(n_state_size_max);
  file.read_raw(state_data.data(), n_state_size_cur);

-
+ llama_state_set_data(ctx, state_data.data());
  }

  return true;
  }

- bool
+ bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  try {
- return
+ return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
  return false;
  }
  }

- bool
+ static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  llama_file file(path_session, "wb");

  file.write_u32(LLAMA_SESSION_MAGIC);
@@ -15391,11 +15519,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

  // save the context state using stream saving
  llama_data_file_context data_ctx(&file);
-
+ llama_state_get_data_internal(ctx, &data_ctx);

  return true;
  }

+ bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ try {
+ return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
+ return false;
+ }
+ }
+
+ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
+ // save the size of size_t as a uint32_t for safety check
+ const size_t size_t_size_size = sizeof(uint32_t);
+
+ // other values
+ const size_t s_cell_count_size = sizeof(uint32_t);
+ const size_t s_layer_count_size = sizeof(uint32_t);
+ const size_t n_embd_v_gqa_size = sizeof(uint32_t);
+
+ size_t s_cell_count = 0;
+ size_t s_cell_data_size = 0;
+ const auto & kv_self = ctx->kv_self;
+ const auto & hparams = ctx->model.hparams;
+
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
+ const auto & cell = kv_self.cells[i];
+ if (cell.seq_id.count(seq_id) > 0) {
+ ++s_cell_count;
+ s_cell_data_size += sizeof(llama_pos);
+ }
+ }
+
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // types of keys and values
+ s_cell_data_size += sizeof(int32_t) * 2;
+ // k_size_row and v_size_el values of layer
+ s_cell_data_size += sizeof(size_t) * 2;
+
+ // keys
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ s_cell_data_size += k_size_row * s_cell_count;
+
+ // values (transposed)
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
+ }
+
+ const size_t s_total = (
+ size_t_size_size +
+ s_cell_count_size +
+ s_layer_count_size +
+ n_embd_v_gqa_size +
+ s_cell_data_size
+ );
+
+ return s_total;
+ }
+
+ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+ const auto & kv_self = ctx->kv_self;
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+ // Save the size of size_t as a uint32_t for safety check
+ const uint32_t size_t_size = sizeof(size_t);
+ data_ctx.write(&size_t_size, sizeof(size_t_size));
+
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+ uint32_t cell_count = 0;
+
+ // Count the number of cells with the specified seq_id
+ // Find all the ranges of cells with this seq id
+ {
+ uint32_t cell_range_begin = kv_self.size;
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
+ const auto & cell = kv_self.cells[i];
+ if (cell.has_seq_id(seq_id)) {
+ ++cell_count;
+ if (cell_range_begin == kv_self.size) {
+ cell_range_begin = i;
+ }
+ }
+ else {
+ if (cell_range_begin != kv_self.size) {
+ cell_ranges.push_back({ cell_range_begin, i });
+ cell_range_begin = kv_self.size;
+ }
+ }
+ }
+ if (cell_range_begin != kv_self.size) {
+ cell_ranges.push_back({ cell_range_begin, kv_self.size });
+ }
+
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+ uint32_t cell_count_check = 0;
+ for (const auto & range : cell_ranges) {
+ cell_count_check += range.second - range.first;
+ }
+ GGML_ASSERT(cell_count == cell_count_check);
+ }
+
+ // Write the cell count
+ data_ctx.write(&cell_count, sizeof(cell_count));
+
+ const auto & hparams = ctx->model.hparams;
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+ // Write the layer count
+ data_ctx.write(&n_layer, sizeof(n_layer));
+
+ // Write n_embd_v_gqa
+ data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+ // Iterate the ranges and write all the pos (this is the token position in the prompt)
+ for (const auto & range : cell_ranges) {
+ for (uint32_t i = range.first; i < range.second; ++i) {
+ const auto & cell = kv_self.cells[i];
+ data_ctx.write(&cell.pos, sizeof(cell.pos));
+ }
+ }
+
+ // Iterate and write all the keys first, each row is a cell
+ // Get whole range at a time
+ std::vector<uint8_t> tmp_buf;
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Write key type
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+ data_ctx.write(&k_type_i, sizeof(k_type_i));
+
+ // Write row size of key
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ data_ctx.write(&k_size_row, sizeof(k_size_row));
+
+ // Read each range of cells of k_size length each into tmp_buf and write out
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ tmp_buf.resize(range_size * k_size_row);
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
+ }
+ }
+
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
+ const uint32_t kv_size = kv_self.size;
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Write value type
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
+
+ // Write element size
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
+
+ // For each row, we get the element values of each cell
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+ tmp_buf.resize(range_size * v_size_el);
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
+ }
+ }
+ }
+
+ return data_ctx.get_size_written();
+ }
+
+ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
+ llama_data_buffer_context data_ctx(dst);
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+ }
+
+ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+ auto & kv_self = ctx->kv_self;
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+ // Wipe the slot
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+
+ const uint8_t * inp = src;
+
+ // Read size of size_t
+ uint32_t size_t_size;
+ memcpy(&size_t_size, inp, sizeof(size_t_size));
+ inp += sizeof(size_t_size);
+ if (size_t_size != sizeof(size_t)) {
+ LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
+ return 0;
+ }
+
+ // Read the cell count
+ uint32_t cell_count;
+ memcpy(&cell_count, inp, sizeof(cell_count));
+ inp += sizeof(cell_count);
+
+ // Read the layer count
+ uint32_t n_layer_ref;
+ memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
+ inp += sizeof(n_layer_ref);
+
+ // Read n_embd_v_gqa
+ uint32_t n_embd_v_gqa_ref;
+ memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
+ inp += sizeof(n_embd_v_gqa_ref);
+
+ // Sanity check model compatibility
+ const auto & hparams = ctx->model.hparams;
+ const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+ if (n_layer != n_layer_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
+ return 0;
+ }
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
+ return 0;
+ }
+
+ // Allocate the new cells for the slot
+ if (cell_count) {
+ llama_batch batch = llama_batch_init(cell_count, 0, 1);
+ batch.n_tokens = cell_count;
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ llama_pos pos;
+ memcpy(&pos, inp, sizeof(pos));
+ inp += sizeof(pos);
+
+ batch.pos[i] = pos;
+ batch.n_seq_id[i] = 1;
+ batch.seq_id[i][0] = dest_seq_id;
+ }
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
+ llama_batch_free(batch);
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+ return 0;
+ }
+
+ // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+ // Assume that this is one contiguous block of cells
+ GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
+ GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+ GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
+
+ // Cleanup
+ llama_batch_free(batch);
+ }
+
+ const uint32_t kv_size = kv_self.size;
+ const uint32_t kv_head = kv_self.head;
+
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Read type of key
+ int32_t k_type_i_ref;
+ memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
+ inp += sizeof(k_type_i_ref);
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+ if (k_type_i != k_type_i_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+ return 0;
+ }
+
+ // Read row size of key
+ size_t k_size_row_ref;
+ memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
+ inp += sizeof(k_size_row_ref);
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ if (k_size_row != k_size_row_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
+ return 0;
+ }
+
+ if (cell_count) {
+ // Read and set the keys for the whole cell range
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
+ inp += cell_count * k_size_row;
+ }
+ }
+
+ // For each layer, read the values for each cell (transposed)
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Read type of value
+ int32_t v_type_i_ref;
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+ inp += sizeof(v_type_i_ref);
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ if (v_type_i != v_type_i_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return 0;
+ }
+
+ // Read element size of value
+ size_t v_size_el_ref;
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+ inp += sizeof(v_size_el_ref);
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ if (v_size_el != v_size_el_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+ return 0;
+ }
+
+ if (cell_count) {
+ // For each row in the transposed matrix, read the values for the whole cell range
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+ inp += cell_count * v_size_el;
+ }
+ }
+ }
+
+ const size_t nread = inp - src;
+ return nread;
+ }
+
+ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(filepath, "wb");
+
+ file.write_u32(LLAMA_STATE_SEQ_MAGIC);
+ file.write_u32(LLAMA_STATE_SEQ_VERSION);
+
+ // save the prompt
+ file.write_u32((uint32_t)n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state using stream saving
+ llama_data_file_context data_ctx(&file);
+ llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+
+ const size_t res = file.tell();
+ GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
+ return res;
+ }
+
+ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(filepath, "rb");
+
+ // version checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
+ return 0;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return 0;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t state_size = file.size - file.tell();
+ std::vector<uint8_t> state_data(state_size);
+ file.read_raw(state_data.data(), state_size);
+ const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
+ if (!nread) {
+ LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
+ return 0;
+ }
+ GGML_ASSERT(nread <= state_size);
+ GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
+ }
+
+ return file.tell();
+ }
+
+ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+ try {
+ return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
+ return 0;
+ }
+ }
+
+ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ try {
+ return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
+ return 0;
+ }
+ }
+
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
  ctx->cparams.n_threads = n_threads;
  ctx->cparams.n_threads_batch = n_threads_batch;
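The block above introduces per-sequence state serialization: llama_state_seq_get_size / get_data / set_data operate on a single seq_id, and llama_state_seq_save_file / load_file wrap them together with the prompt tokens behind a LLAMA_STATE_SEQ_MAGIC header. A hedged usage sketch in C++ (the file name, capacities, and the two contexts are illustrative assumptions):

    // save sequence 0 of ctx together with the tokens that produced it
    llama_state_seq_save_file(ctx, "seq0.bin", 0, prompt_tokens.data(), prompt_tokens.size());

    // restore it into sequence 1 of another context built from the same model
    std::vector<llama_token> tokens_out(4096);   // caller-chosen capacity for the saved prompt
    size_t n_tokens_out = 0;
    const size_t n_read = llama_state_seq_load_file(other_ctx, "seq0.bin", 1,
                                                    tokens_out.data(), tokens_out.size(), &n_tokens_out);
    // n_read == 0 signals a failed restore (bad magic/version, capacity exceeded, or model mismatch)
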
@@ -15509,23 +16046,31 @@ float * llama_get_logits(struct llama_context * ctx) {
  }

  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+ int32_t j = -1;
  llama_synchronize(ctx);

  try {
  if (ctx->logits == nullptr) {
  throw std::runtime_error("no logits");
  }
-
+
+ if (i < 0) {
+ j = ctx->n_outputs + i;
+ if (j < 0) {
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+ }
+ } else if ((size_t) i >= ctx->output_ids.size()) {
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+ } else {
+ j = ctx->output_ids[i];
  }
- const int32_t j = ctx->output_ids[i];

  if (j < 0) {
  throw std::runtime_error(format("batch.logits[%d] != true", i));
  }
- if (
+ if (j >= ctx->n_outputs) {
  // This should not happen
- throw std::runtime_error(format("corrupt output buffer (j=%d,
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  }

  return ctx->logits + j*ctx->model.hparams.n_vocab;
@@ -15545,23 +16090,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  }

  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+ int32_t j = -1;
+
  llama_synchronize(ctx);

  try {
  if (ctx->embd == nullptr) {
  throw std::runtime_error("no embeddings");
  }
-
+
+ if (i < 0) {
+ j = ctx->n_outputs + i;
+ if (j < 0) {
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+ }
+ } else if ((size_t) i >= ctx->output_ids.size()) {
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+ } else {
+ j = ctx->output_ids[i];
  }
- const int32_t j = ctx->output_ids[i];

  if (j < 0) {
  throw std::runtime_error(format("batch.logits[%d] != true", i));
  }
- if (
+ if (j >= ctx->n_outputs) {
  // This should not happen
- throw std::runtime_error(format("corrupt output buffer (j=%d,
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  }

  return ctx->embd + j*ctx->model.hparams.n_embd;
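Because llama_decode_internal now records the total output count in lctx.n_outputs, both accessors above accept a negative index counted back from the last output of the previous batch. A small sketch (it assumes the final token of that batch had logits/embeddings enabled):

    // -1 maps to j = n_outputs - 1, i.e. the last computed output of the previous decode call
    const float * last_logits = llama_get_logits_ith(ctx, -1);
    const float * last_embd   = llama_get_embeddings_ith(ctx, -1);
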
@@ -15608,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
  return model->vocab.special_eos_id;
  }

+ llama_token llama_token_cls(const struct llama_model * model) {
+ return model->vocab.special_cls_id;
+ }
+
+ llama_token llama_token_sep(const struct llama_model * model) {
+ return model->vocab.special_sep_id;
+ }
+
  llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }
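These accessors expose the new CLS and SEP ids; for the WPM ("bert") tokenizer defaults set earlier in this diff they are 101 and 102, and they are -1 when the vocabulary does not define them. Sketch:

    const llama_token cls_id = llama_token_cls(model);   // -1 if the vocab has no CLS token
    const llama_token sep_id = llama_token_sep(model);   // -1 if the vocab has no SEP token
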
@@ -15642,9 +16204,9 @@ int32_t llama_tokenize(
  int32_t text_len,
  llama_token * tokens,
  int32_t n_tokens_max,
- bool
- bool
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len),
+ bool add_special,
+ bool parse_special) {
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);

  if (n_tokens_max < (int) res.size()) {
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);