llama_cpp 0.14.4 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +11 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +7 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +155 -155
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +878 -216
- data/vendor/tmp/llama.cpp/ggml.c +8 -8
- data/vendor/tmp/llama.cpp/ggml.h +7 -7
- data/vendor/tmp/llama.cpp/llama.cpp +686 -124
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- metadata +2 -2
@@ -261,6 +261,7 @@ enum llm_kv {
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_VERSION,
     LLM_KV_GENERAL_URL,
     LLM_KV_GENERAL_DESCRIPTION,
     LLM_KV_GENERAL_LICENSE,
@@ -317,6 +318,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_CLS_ID,
+    LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_ADD_PREFIX,
@@ -330,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
     { LLM_KV_GENERAL_NAME, "general.name" },
     { LLM_KV_GENERAL_AUTHOR, "general.author" },
+    { LLM_KV_GENERAL_VERSION, "general.version" },
     { LLM_KV_GENERAL_URL, "general.url" },
     { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
     { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -386,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+    { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
     { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
     { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
@@ -924,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+           { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+           { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
        },
    },
    {
@@ -1630,17 +1638,17 @@ static size_t llama_get_device_memory(int device) {
 #if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &
+    ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
 #else
     return 1;
@@ -1697,6 +1705,8 @@ enum e_model {
     MODEL_MEDIUM,
     MODEL_LARGE,
     MODEL_XL,
+    MODEL_8x7B,
+    MODEL_8x22B,
 };

 static const size_t kiB = 1024;
@@ -2014,11 +2024,13 @@ struct llama_vocab {
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;

     // default LLaMA special tokens
-    id special_bos_id
-    id special_eos_id
-    id special_unk_id
-    id special_sep_id
-    id special_pad_id
+    id special_bos_id = 1;
+    id special_eos_id = 2;
+    id special_unk_id = 0;
+    id special_sep_id = -1;
+    id special_pad_id = -1;
+    id special_cls_id = -1;
+    id special_mask_id = -1;

     int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
     int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
@@ -2175,7 +2187,7 @@ struct llama_context {

     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
     size_t output_size = 0; // capacity (of tokens positions) for the output buffers
-    int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
+    int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

     bool logits_all = false;

@@ -3548,6 +3560,8 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
         case MODEL_XL: return "1.5B";
+        case MODEL_8x7B: return "8x7B";
+        case MODEL_8x22B: return "8x22B";
         default: return "?B";
     }
 }
@@ -3662,15 +3676,23 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-
-
-
-
-
-
-
-
-
+                if (hparams.n_expert == 8) {
+                    switch (hparams.n_layer) {
+                        case 32: model.type = e_model::MODEL_8x7B; break;
+                        case 56: model.type = e_model::MODEL_8x22B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    switch (hparams.n_layer) {
+                        case 22: model.type = e_model::MODEL_1B; break;
+                        case 26: model.type = e_model::MODEL_3B; break;
+                        case 32: model.type = e_model::MODEL_7B; break;
+                        case 40: model.type = e_model::MODEL_13B; break;
+                        case 48: model.type = e_model::MODEL_34B; break;
+                        case 60: model.type = e_model::MODEL_30B; break;
+                        case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
                 }
             } break;
         case LLM_ARCH_MINICPM:
@@ -3974,7 +3996,9 @@ static void llm_load_hparams(
     }

 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(
+static std::vector<llama_vocab::id> llama_tokenize_internal(
+    const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

 static void llm_load_vocab(
@@ -3996,23 +4020,27 @@ static void llm_load_vocab(
         vocab.type = LLAMA_VOCAB_TYPE_NONE;

         // default special tokens
-        vocab.special_bos_id
-        vocab.special_eos_id
-        vocab.special_unk_id
-        vocab.special_sep_id
-        vocab.special_pad_id
-        vocab.
+        vocab.special_bos_id = -1;
+        vocab.special_eos_id = -1;
+        vocab.special_unk_id = -1;
+        vocab.special_sep_id = -1;
+        vocab.special_pad_id = -1;
+        vocab.special_cls_id = -1;
+        vocab.special_mask_id = -1;
+        vocab.linefeed_id = -1;

         return;
     } else if (tokenizer_name == "llama") {
         vocab.type = LLAMA_VOCAB_TYPE_SPM;

         // default special tokens
-        vocab.special_bos_id
-        vocab.special_eos_id
-        vocab.special_unk_id
-        vocab.special_sep_id
-        vocab.special_pad_id
+        vocab.special_bos_id = 1;
+        vocab.special_eos_id = 2;
+        vocab.special_unk_id = 0;
+        vocab.special_sep_id = -1;
+        vocab.special_pad_id = -1;
+        vocab.special_cls_id = -1;
+        vocab.special_mask_id = -1;

         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
@@ -4047,20 +4075,24 @@ static void llm_load_vocab(
         }

         // default special tokens
-        vocab.special_bos_id
-        vocab.special_eos_id
-        vocab.special_unk_id
-        vocab.special_sep_id
-        vocab.special_pad_id
+        vocab.special_bos_id = 11;
+        vocab.special_eos_id = 11;
+        vocab.special_unk_id = -1;
+        vocab.special_sep_id = -1;
+        vocab.special_pad_id = -1;
+        vocab.special_cls_id = -1;
+        vocab.special_mask_id = -1;
     } else if (tokenizer_name == "bert") {
         vocab.type = LLAMA_VOCAB_TYPE_WPM;

         // default special tokens
-        vocab.special_bos_id
-        vocab.special_eos_id
-        vocab.special_unk_id
-        vocab.special_sep_id
-        vocab.special_pad_id
+        vocab.special_bos_id = -1;
+        vocab.special_eos_id = -1;
+        vocab.special_unk_id = 100;
+        vocab.special_sep_id = 102;
+        vocab.special_pad_id = 0;
+        vocab.special_cls_id = 101;
+        vocab.special_mask_id = 103;
         vocab.add_space_prefix = false;
     } else {
         LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4123,11 +4155,13 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,
-            { LLM_KV_TOKENIZER_EOS_ID,
-            { LLM_KV_TOKENIZER_UNK_ID,
-            { LLM_KV_TOKENIZER_SEP_ID,
-            { LLM_KV_TOKENIZER_PAD_ID,
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+            { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+            { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
@@ -4319,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

     // special tokens
-    if (vocab.special_bos_id
-    if (vocab.special_eos_id
-    if (vocab.special_unk_id
-    if (vocab.special_sep_id
-    if (vocab.special_pad_id
-    if (vocab.
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }

 // Returns false if cancelled by progress_callback
@@ -5404,6 +5440,11 @@ static bool llm_load_tensors(

                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

+                    if (n_layer >= 64){
+                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
+                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+                    }
+
                     layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                     layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                     layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -9452,6 +9493,31 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }

+            if (model.layers[il].attn_q_norm) {
+                Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                    ggml_element_size(Qcur) * n_embd_head,
+                    ggml_element_size(Qcur) * n_embd_head * n_head,
+                    0);
+                cb(Qcur, "Qcur", il);
+                Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                    ggml_element_size(Kcur) * n_embd_head,
+                    ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                    0);
+                cb(Kcur, "Kcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                    model.layers[il].attn_q_norm,
+                    NULL,
+                    LLM_NORM, cb, il);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                    model.layers[il].attn_k_norm,
+                    NULL,
+                    LLM_NORM, cb, il);
+                cb(Kcur, "Kcur", il);
+            }
+
             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -10409,6 +10475,9 @@ static int llama_decode_internal(
         n_outputs_prev += lctx.n_outputs;
     }

+    // set to total number of outputs in the batch, for use in llama_get_logits_ith
+    lctx.n_outputs = n_outputs;
+
     // wait for the computation to finish (automatically done when obtaining the model output)
     //llama_synchronize(&lctx);

@@ -11052,7 +11121,7 @@ struct llm_tokenizer_bpe {
             add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
         }

-        // add the
+        // add the finished tokens to the final list keeping correct order for next and prev
         for (auto & sym : symbols) {
             if (sym.n > 0) {
                 sym.prev = final_prev_index;
@@ -11321,9 +11390,6 @@ struct llm_tokenizer_wpm {
                 output.push_back(vocab.special_unk_id);
             }
         }
-
-        // append eos token
-        output.push_back(vocab.special_eos_id);
     }

     std::vector<std::string> preprocess(const std::string & text) {
@@ -11528,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
         }
     }

-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
     std::vector<llama_vocab::id> output;
-
-    // OG tokenizer behavior:
-    //
-    // tokenizer.encode('', add_bos=True) returns [1]
-    // tokenizer.encode('', add_bos=False) returns []
-
-    if (bos && vocab.special_bos_id != -1) {
-        output.push_back(vocab.special_bos_id);
-    }
-
-    if (raw_text.empty()) {
-        return output;
-    }
-
     std::forward_list<fragment_buffer_variant> fragment_buffer;
-    fragment_buffer.emplace_front(raw_text, 0, raw_text.length());

-    if (
+    if (!raw_text.empty()) {
+        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+        if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+    }

     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
+                // OG tokenizer behavior:
+                //
+                // tokenizer.encode('', add_special_tokens=True) returns [1]
+                // tokenizer.encode('', add_special_tokens=False) returns []
+
+                if (add_special && vocab.special_add_bos != 0) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -11577,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                if (add_special && vocab.special_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
+                if (add_special && vocab.special_add_bos == 1) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11593,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                GGML_ASSERT(vocab.special_add_eos != 1);
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
             {
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_cls_id != -1);
+                    output.push_back(vocab.special_cls_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11609,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_sep_id != -1);
+                    output.push_back(vocab.special_sep_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_NONE:
             GGML_ASSERT(false);
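The tokenizer changes above replace the old single `bos` flag with the pair `add_special`/`parse_special`, which `llama_tokenize` now forwards to `llama_tokenize_internal`. A minimal caller-side sketch of the new flags (illustrative only; the helper below is not part of this package, and the buffer sizing is a rough upper bound):

// sketch: tokenizing a prompt with the new add_special / parse_special flags
#include "llama.h"
#include <string>
#include <vector>

static std::vector<llama_token> tokenize_prompt(const llama_model * model, const std::string & text) {
    std::vector<llama_token> tokens(text.size() + 8); // rough upper bound on the token count
    const int32_t n = llama_tokenize(
        model, text.c_str(), (int32_t) text.size(),
        tokens.data(), (int32_t) tokens.size(),
        /*add_special   =*/ true,   // let the vocabulary decide on BOS/EOS or CLS/SEP
        /*parse_special =*/ false); // treat special-token text in the prompt as plain text
    if (n < 0) {
        return {}; // buffer too small (or tokenization failed)
    }
    tokens.resize(n);
    return tokens;
}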
@@ -11775,7 +11861,9 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

     if (stack.empty()) {
-        new_stacks.
+        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+            new_stacks.emplace_back(stack);
+        }
         return;
     }

@@ -11812,7 +11900,10 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.
+            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+                // only add the stack if it's not a duplicate of one we already have
+                new_stacks.emplace_back(stack);
+            }
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11826,12 +11917,13 @@ static void llama_grammar_advance_stack(
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-
+void llama_grammar_accept(
         const std::vector<std::vector<llama_grammar_element>> & rules,
         const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t chr
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

-
+    new_stacks.clear();

     for (const auto & stack : stacks) {
         if (stack.empty()) {
@@ -11850,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
             llama_grammar_advance_stack(rules, new_stack, new_stacks);
         }
     }
-
-    return new_stacks;
 }

 static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11865,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         const std::vector<llama_grammar_candidate> & candidates) {

     std::vector<llama_grammar_candidate> rejects;
+    rejects.reserve(candidates.size());

     if (stack.empty()) {
         for (const auto & tok : candidates) {
@@ -11878,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     const llama_grammar_element * stack_pos = stack.back();

     std::vector<llama_grammar_candidate> next_candidates;
+    next_candidates.reserve(candidates.size());
+
     for (const auto & tok : candidates) {
         if (*tok.code_points == 0) {
             // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12685,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
+    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-
+        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+        grammar->stacks = tmp_new_stacks;
     }
     grammar->partial_utf8 = decoded.second;
     GGML_ASSERT(!grammar->stacks.empty());
@@ -13318,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }

-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     std::mutex mutex;
-
+    int64_t counter = 0;
     size_t new_size = 0;
     if (nthread < 2) {
         // single-thread
@@ -13328,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     }
     auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
             nrows, n_per_row, imatrix]() {
-        const
+        const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
         while (true) {
             std::unique_lock<std::mutex> lock(mutex);
-
+            int64_t first_row = counter; counter += nrows_per_chunk;
             if (first_row >= nrows) {
                 if (local_size > 0) {
                     new_size += local_size;
@@ -13340,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
                 break;
             }
             lock.unlock();
-            const
+            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
             local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
         }
     };
@@ -13463,7 +13558,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);

         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight")
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
@@ -13473,7 +13569,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

     // sanity checks
-
+    //
+    //  - qs.n_attention_wv == 0                     for Mamba       models
+    //  - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+    //
+    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");

     size_t total_size_org = 0;
     size_t total_size_new = 0;
@@ -13529,6 +13629,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
+
+        // do not quantize norm tensors
+        quantize &= name.find("_norm.weight") == std::string::npos;
+
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;

@@ -13557,10 +13661,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (!params->pure && ggml_is_quantized(default_type)) {
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
         }
-
+        if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
-
+        if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
             new_type = params->output_tensor_type;
         }

@@ -13575,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_size = ggml_nbytes(tensor);
             LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
-            const
+            const int64_t nelements = ggml_nelements(tensor);

             const float * imatrix = nullptr;
             if (imatrix_data) {
@@ -13627,20 +13731,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);

-            if (work.size() < nelements * 4) {
+            if (work.size() < (size_t)nelements * 4) {
                 work.resize(nelements * 4); // upper bound on size
             }
             new_data = work.data();

-            const
-            const
+            const int64_t n_per_row = tensor->ne[0];
+            const int64_t nrows = tensor->ne[1];

-            static const
-            const
+            static const int64_t min_chunk_size = 32 * 512;
+            const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);

-            const
-            const
-            const
+            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

             // quantize each expert separately since they have different importance matrices
             new_size = 0;
@@ -14905,9 +15009,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }

+// deprecated
+size_t llama_get_state_size(const struct llama_context * ctx) {
+    return llama_state_get_size(ctx);
+}
+
+// deprecated
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    return llama_state_get_data(ctx, dst);
+}
+
+// deprecated
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    return llama_state_set_data(ctx, src);
+}
+
+// deprecated
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+}
+
+// deprecated
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+}

 // Returns the *maximum* size of the state
-size_t
+size_t llama_state_get_size(const struct llama_context * ctx) {
     const auto & cparams = ctx->cparams;
     const auto & hparams = ctx->model.hparams;

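The hunk above keeps llama_get_state_size, llama_copy_state_data, llama_set_state_data and the session-file helpers as thin deprecated wrappers around the renamed llama_state_* functions. A hedged sketch of round-tripping the full context state through the new names (assumes a valid ctx; this helper is not part of the diff):

// sketch: save and restore the whole context state in memory via the renamed API
#include "llama.h"
#include <cstdint>
#include <vector>

static bool roundtrip_state(llama_context * ctx) {
    const size_t max_size = llama_state_get_size(ctx); // upper bound on the serialized size
    std::vector<uint8_t> buf(max_size);

    const size_t written = llama_state_get_data(ctx, buf.data());
    if (written == 0 || written > max_size) {
        return false;
    }
    return llama_state_set_data(ctx, buf.data()) == written;
}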
@@ -14995,15 +15123,15 @@ struct llama_data_file_context : llama_data_context {
 * file context:
 * llama_file file("/path", "wb");
 * llama_data_file_context data_ctx(&file);
-*
+* llama_state_get_data(ctx, &data_ctx);
 *
 * buffer context:
 * std::vector<uint8_t> buf(max_size, 0);
 * llama_data_buffer_context data_ctx(&buf.data());
-*
+* llama_state_get_data(ctx, &data_ctx);
 *
 */
-static void
+static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::ostringstream rng_ss;
@@ -15147,15 +15275,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         }
     }

-size_t
+size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
     llama_data_buffer_context data_ctx(dst);
-
+    llama_state_get_data_internal(ctx, &data_ctx);

     return data_ctx.get_size_written();
 }

 // Sets the state reading from the specified source address
-size_t
+size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
     const uint8_t * inp = src;

     // set rng
@@ -15307,14 +15435,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     }

     const size_t nread = inp - src;
-    const size_t max_size =
+    const size_t max_size = llama_state_get_size(ctx);

     GGML_ASSERT(nread <= max_size);

     return nread;
 }

-static bool
+static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");

     // sanity checks
@@ -15352,7 +15480,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     // restore the context state
     {
         const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max =
+        const size_t n_state_size_max = llama_state_get_size(ctx);

         if (n_state_size_cur > n_state_size_max) {
             LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -15362,22 +15490,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
         std::vector<uint8_t> state_data(n_state_size_max);
         file.read_raw(state_data.data(), n_state_size_cur);

-
+        llama_state_set_data(ctx, state_data.data());
     }

     return true;
 }

-bool
+bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     try {
-        return
+        return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
         return false;
     }
 }

-bool
+static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");

     file.write_u32(LLAMA_SESSION_MAGIC);
@@ -15391,11 +15519,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

     // save the context state using stream saving
     llama_data_file_context data_ctx(&file);
-
+    llama_state_get_data_internal(ctx, &data_ctx);

     return true;
 }

+bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
+        return false;
+    }
+}
+
+size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
+    // save the size of size_t as a uint32_t for safety check
+    const size_t size_t_size_size = sizeof(uint32_t);
+
+    // other values
+    const size_t s_cell_count_size = sizeof(uint32_t);
+    const size_t s_layer_count_size = sizeof(uint32_t);
+    const size_t n_embd_v_gqa_size = sizeof(uint32_t);
+
+    size_t s_cell_count = 0;
+    size_t s_cell_data_size = 0;
+    const auto & kv_self = ctx->kv_self;
+    const auto & hparams = ctx->model.hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+    for (uint32_t i = 0; i < kv_self.size; ++i) {
+        const auto & cell = kv_self.cells[i];
+        if (cell.seq_id.count(seq_id) > 0) {
+            ++s_cell_count;
+            s_cell_data_size += sizeof(llama_pos);
+        }
+    }
+
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // types of keys and values
+        s_cell_data_size += sizeof(int32_t) * 2;
+        // k_size_row and v_size_el values of layer
+        s_cell_data_size += sizeof(size_t) * 2;
+
+        // keys
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        s_cell_data_size += k_size_row * s_cell_count;
+
+        // values (transposed)
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
+    }
+
+    const size_t s_total = (
+        size_t_size_size +
+        s_cell_count_size +
+        s_layer_count_size +
+        n_embd_v_gqa_size +
+        s_cell_data_size
+    );
+
+    return s_total;
+}
+
+static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+    const auto & kv_self = ctx->kv_self;
+    GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+    // Save the size of size_t as a uint32_t for safety check
+    const uint32_t size_t_size = sizeof(size_t);
+    data_ctx.write(&size_t_size, sizeof(size_t_size));
+
+    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+    uint32_t cell_count = 0;
+
+    // Count the number of cells with the specified seq_id
+    // Find all the ranges of cells with this seq id
+    {
+        uint32_t cell_range_begin = kv_self.size;
+        for (uint32_t i = 0; i < kv_self.size; ++i) {
+            const auto & cell = kv_self.cells[i];
+            if (cell.has_seq_id(seq_id)) {
+                ++cell_count;
+                if (cell_range_begin == kv_self.size) {
+                    cell_range_begin = i;
+                }
+            }
+            else {
+                if (cell_range_begin != kv_self.size) {
+                    cell_ranges.push_back({ cell_range_begin, i });
+                    cell_range_begin = kv_self.size;
+                }
+            }
+        }
+        if (cell_range_begin != kv_self.size) {
+            cell_ranges.push_back({ cell_range_begin, kv_self.size });
+        }
+
+        // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+        uint32_t cell_count_check = 0;
+        for (const auto & range : cell_ranges) {
+            cell_count_check += range.second - range.first;
+        }
+        GGML_ASSERT(cell_count == cell_count_check);
+    }
+
+    // Write the cell count
+    data_ctx.write(&cell_count, sizeof(cell_count));
+
+    const auto & hparams = ctx->model.hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+    // Write the layer count
+    data_ctx.write(&n_layer, sizeof(n_layer));
+
+    // Write n_embd_v_gqa
+    data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+    // Iterate the ranges and write all the pos (this is the token position in the prompt)
+    for (const auto & range : cell_ranges) {
+        for (uint32_t i = range.first; i < range.second; ++i) {
+            const auto & cell = kv_self.cells[i];
+            data_ctx.write(&cell.pos, sizeof(cell.pos));
+        }
+    }
+
+    // Iterate and write all the keys first, each row is a cell
+    // Get whole range at a time
+    std::vector<uint8_t> tmp_buf;
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Write key type
+        const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+        data_ctx.write(&k_type_i, sizeof(k_type_i));
+
+        // Write row size of key
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        data_ctx.write(&k_size_row, sizeof(k_size_row));
+
+        // Read each range of cells of k_size length each into tmp_buf and write out
+        for (const auto & range : cell_ranges) {
+            const size_t range_size = range.second - range.first;
+            tmp_buf.resize(range_size * k_size_row);
+            ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
+            data_ctx.write(tmp_buf.data(), tmp_buf.size());
+        }
+    }
+
+    // For the values, they are transposed, so we also need the element size and get the element ranges from each row
+    const uint32_t kv_size = kv_self.size;
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Write value type
+        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+        data_ctx.write(&v_type_i, sizeof(v_type_i));
+
+        // Write element size
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        data_ctx.write(&v_size_el, sizeof(v_size_el));
+
+        // For each row, we get the element values of each cell
+        for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+            // Read each range of cells of v_size_el length each into tmp_buf and write out
+            for (const auto & range : cell_ranges) {
+                const size_t range_size = range.second - range.first;
+                const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                tmp_buf.resize(range_size * v_size_el);
+                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                data_ctx.write(tmp_buf.data(), tmp_buf.size());
+            }
+        }
+    }
+
+    return data_ctx.get_size_written();
+}
+
+size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
+    llama_data_buffer_context data_ctx(dst);
+    return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+}
+
+size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+    auto & kv_self = ctx->kv_self;
+    GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+    // Wipe the slot
+    llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+
+    const uint8_t * inp = src;
+
+    // Read size of size_t
+    uint32_t size_t_size;
+    memcpy(&size_t_size, inp, sizeof(size_t_size));
+    inp += sizeof(size_t_size);
+    if (size_t_size != sizeof(size_t)) {
+        LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
+        return 0;
+    }
+
+    // Read the cell count
+    uint32_t cell_count;
+    memcpy(&cell_count, inp, sizeof(cell_count));
+    inp += sizeof(cell_count);
+
+    // Read the layer count
+    uint32_t n_layer_ref;
+    memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
+    inp += sizeof(n_layer_ref);
+
+    // Read n_embd_v_gqa
+    uint32_t n_embd_v_gqa_ref;
+    memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
+    inp += sizeof(n_embd_v_gqa_ref);
+
+    // Sanity check model compatibility
+    const auto & hparams = ctx->model.hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+    if (n_layer != n_layer_ref) {
+        LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
+        return 0;
+    }
+    if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+        LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
+        return 0;
+    }
+
+    // Allocate the new cells for the slot
+    if (cell_count) {
+        llama_batch batch = llama_batch_init(cell_count, 0, 1);
+        batch.n_tokens = cell_count;
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_pos pos;
+            memcpy(&pos, inp, sizeof(pos));
+            inp += sizeof(pos);
+
+            batch.pos[i] = pos;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id[i][0] = dest_seq_id;
+        }
+        if (!llama_kv_cache_find_slot(kv_self, batch)) {
+            llama_batch_free(batch);
+            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            return 0;
+        }
+
+        // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+        // Assume that this is one contiguous block of cells
+        GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
+        GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
+        GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+        GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
+        GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
+
+        // Cleanup
+        llama_batch_free(batch);
+    }
+
+    const uint32_t kv_size = kv_self.size;
+    const uint32_t kv_head = kv_self.head;
+
+    // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Read type of key
+        int32_t k_type_i_ref;
+        memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
+        inp += sizeof(k_type_i_ref);
+        const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+        if (k_type_i != k_type_i_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+            return 0;
+        }
+
+        // Read row size of key
+        size_t k_size_row_ref;
+        memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
+        inp += sizeof(k_size_row_ref);
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        if (k_size_row != k_size_row_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
+            return 0;
+        }
+
+        if (cell_count) {
+            // Read and set the keys for the whole cell range
+            ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
+            inp += cell_count * k_size_row;
+        }
+    }
+
+    // For each layer, read the values for each cell (transposed)
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Read type of value
+        int32_t v_type_i_ref;
+        memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+        inp += sizeof(v_type_i_ref);
+        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+        if (v_type_i != v_type_i_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+            return 0;
+        }
+
+        // Read element size of value
+        size_t v_size_el_ref;
+        memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+        inp += sizeof(v_size_el_ref);
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        if (v_size_el != v_size_el_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+            return 0;
+        }
+
+        if (cell_count) {
+            // For each row in the transposed matrix, read the values for the whole cell range
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                inp += cell_count * v_size_el;
+            }
+        }
+    }
+
+    const size_t nread = inp - src;
+    return nread;
+}
+
+static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(filepath, "wb");
+
+    file.write_u32(LLAMA_STATE_SEQ_MAGIC);
+    file.write_u32(LLAMA_STATE_SEQ_VERSION);
+
+    // save the prompt
+    file.write_u32((uint32_t)n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state using stream saving
+    llama_data_file_context data_ctx(&file);
+    llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+
+    const size_t res = file.tell();
+    GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
+    return res;
+}
+
+static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(filepath, "rb");
+
+    // version checks
+    {
+        const uint32_t magic = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
+            LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
+            return 0;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return 0;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t state_size = file.size - file.tell();
+        std::vector<uint8_t> state_data(state_size);
+        file.read_raw(state_data.data(), state_size);
+        const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
+        if (!nread) {
+            LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
+            return 0;
+        }
+        GGML_ASSERT(nread <= state_size);
+        GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
+    }
+
+    return file.tell();
+}
+
+size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
+        return 0;
+    }
+}
+
+size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
+        return 0;
+    }
+}
+
 void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
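The new llama_state_seq_* functions added above serialize only the KV-cache cells of a single sequence instead of the whole context. A small sketch of saving one sequence to a file and restoring it into another sequence slot (the file name and token bookkeeping are illustrative assumptions, not part of this diff):

// sketch: per-sequence save/load using the API introduced in this hunk
#include "llama.h"
#include <vector>

static bool copy_seq_via_file(llama_context * ctx, const std::vector<llama_token> & prompt) {
    // persist sequence 0 together with the prompt tokens it was evaluated on
    const size_t saved = llama_state_seq_save_file(ctx, "seq0.bin", 0, prompt.data(), prompt.size());
    if (saved == 0) {
        return false;
    }

    // restore the cells into sequence 1 of the same (or a compatible) context
    std::vector<llama_token> tokens(prompt.size());
    size_t n_loaded = 0;
    const size_t read = llama_state_seq_load_file(ctx, "seq0.bin", 1, tokens.data(), tokens.size(), &n_loaded);
    return read != 0 && n_loaded == prompt.size();
}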
@@ -15509,23 +16046,31 @@ float * llama_get_logits(struct llama_context * ctx) {
 }

 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
     llama_synchronize(ctx);

     try {
         if (ctx->logits == nullptr) {
             throw std::runtime_error("no logits");
         }
-
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];

         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if (
+        if (j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d,
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }

         return ctx->logits + j*ctx->model.hparams.n_vocab;
@@ -15545,23 +16090,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 }

 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
     llama_synchronize(ctx);

     try {
         if (ctx->embd == nullptr) {
             throw std::runtime_error("no embeddings");
         }
-
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];

         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if (
+        if (j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d,
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }

         return ctx->embd + j*ctx->model.hparams.n_embd;
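With lctx.n_outputs now recording the total output count after decoding (see the llama_decode_internal hunk) and the index handling above, llama_get_logits_ith and llama_get_embeddings_ith accept negative indices counted from the end of the outputs. A brief, hedged sketch (assumes the previous batch requested logits for at least one token; the helper is not part of this diff):

// sketch: -1 now addresses the last computed output row of the previous decode call
#include "llama.h"

static float last_token_logit(llama_context * ctx, llama_token tok, int32_t n_vocab) {
    const float * logits = llama_get_logits_ith(ctx, -1); // last output row
    if (logits == nullptr || tok < 0 || tok >= n_vocab) {
        return 0.0f; // defensive fallback for the sketch
    }
    return logits[tok];
}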
@@ -15608,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return model->vocab.special_eos_id;
 }

+llama_token llama_token_cls(const struct llama_model * model) {
+    return model->vocab.special_cls_id;
+}
+
+llama_token llama_token_sep(const struct llama_model * model) {
+    return model->vocab.special_sep_id;
+}
+
 llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
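llama_token_cls and llama_token_sep expose the CLS/SEP ids loaded by the vocabulary changes earlier in this diff. A hedged sketch of framing a BERT-style (WPM) token sequence by hand, assuming the vocabulary actually defines both tokens (helper not part of this package):

// sketch: manually framing tokens as CLS ... SEP, mirroring what add_special does for WPM vocabs
#include "llama.h"
#include <vector>

static std::vector<llama_token> frame_for_bert(const llama_model * model, const std::vector<llama_token> & body) {
    const llama_token cls = llama_token_cls(model);
    const llama_token sep = llama_token_sep(model);
    if (cls == -1 || sep == -1) {
        return body; // vocabulary defines no CLS/SEP; leave the input unchanged
    }
    std::vector<llama_token> out;
    out.reserve(body.size() + 2);
    out.push_back(cls);
    out.insert(out.end(), body.begin(), body.end());
    out.push_back(sep);
    return out;
}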
@@ -15642,9 +16204,9 @@ int32_t llama_tokenize(
                      int32_t   text_len,
                  llama_token * tokens,
                      int32_t   n_tokens_max,
-                         bool
-                         bool
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len),
+                         bool   add_special,
+                         bool   parse_special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);

     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);