llama_cpp 0.14.6 → 0.14.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +11 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -293
- data/vendor/tmp/llama.cpp/ggml.c +3 -17
- data/vendor/tmp/llama.cpp/llama.cpp +379 -66
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +404 -553
- metadata +2 -2
@@ -211,6 +211,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -793,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -1600,12 +1619,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
@@ -2120,7 +2139,7 @@ struct llama_vocab {
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
-    id special_eot_id = -1;
+    id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

     bool add_space_prefix = true;

@@ -2980,9 +2999,13 @@ struct llama_model_loader {

         ggml_tensor * tensor;

-        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+            }
         }
     };
     std::vector<llama_tensor_weight> weights;
@@ -3021,15 +3044,15 @@ struct llama_model_loader {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(0, cur->name, meta, cur);
+            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
         }
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-        contexts.emplace_back(ctx);
-
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -3063,12 +3086,13 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
             }

+            files.emplace_back(new llama_file(split_path, "rb"));
+            contexts.emplace_back(ctx);
+
             // Save tensors data offset info of the shard.
             for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
             }
-            files.emplace_back(new llama_file(split_path, "rb"));
-            contexts.emplace_back(ctx);

             gguf_free(ctx_gguf);
         }
@@ -3278,6 +3302,10 @@ struct llama_model_loader {
         return nullptr;
     }

+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
    const llama_tensor_weight & require_weight(const char * name) const {
        const llama_tensor_weight * weight = get_weight(name);
        if (!weight) {
@@ -3770,7 +3798,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -3955,6 +3983,16 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -4179,7 +4217,10 @@ static void llm_load_vocab(
                 vocab.special_prefix_id = 67;
                 vocab.special_suffix_id = 69;
                 vocab.special_middle_id = 68;
-
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id = 70;
+                vocab.special_eot_id = 107;
             }
         }

@@ -4308,6 +4349,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
         };
+
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
             int32_t & id = std::get<1>(it);
@@ -4322,7 +4364,6 @@ static void llm_load_vocab(
             } else {
                 id = new_id;
             }
-
         }

         // Handle add_bos_token and add_eos_token
@@ -4336,6 +4377,28 @@ static void llm_load_vocab(
                 vocab.special_add_eos = int(temp);
             }
         }
+
+        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+        //       for now, we apply this workaround to find the EOT token based on its text
+        if (vocab.special_eot_id == -1) {
+            for (const auto & t : vocab.token_to_id) {
+                if (
+                        // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+                        //       need to fix convert script
+                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+                        (t.first == "<|eot_id|>" ||
+                         t.first == "<|im_end|>" ||
+                         t.first == "<|end|>" ||
+                         t.first == "<end_of_turn>"
+                        )
+                   ) {
+                    vocab.special_eot_id = t.second;
+                    break;
+                }
+            }
+        }
     }

     // build special tokens cache
@@ -4498,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

     // special tokens
-    if (vocab.special_bos_id
-    if (vocab.special_eos_id
-    if (vocab.special_unk_id
-    if (vocab.special_sep_id
-    if (vocab.special_pad_id
-    if (vocab.special_cls_id
-    if (vocab.special_mask_id
-
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
 }

 // Returns false if cancelled by progress_callback
@@ -5346,6 +5414,33 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_PHI3:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context* ctx_layer = ctx_for_layer(i);
+                        ggml_context* ctx_split = ctx_for_layer_split(i);
+
+                        auto& layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6297,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);

-    if (model.arch == LLM_ARCH_PHI2) {
+    if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
         // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
         // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -8938,12 +9033,140 @@ struct llm_build_context {

         cur = ggml_add(ctx0, cur, model.output_b);
         cb(cur, "result_output", -1);
+        ggml_build_forward_expand(gf, cur);
+        return gf;
+    }
+
+    struct ggml_cgraph * build_phi3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            auto residual = inpL;
+
+            // self-attention
+            {
+                struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    NULL,
+                    LLM_NORM_RMS, cb, il);
+                cb(attn_norm_output, "attn_norm", il);
+
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv) {
+                    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                    cb(cur, "wqkv", il);
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+                }
+                else {
+                    Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                    model.layers[il].wo, NULL,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            }
+
+            cur = ggml_add(ctx0, cur, residual);
+            residual = cur;
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // FF
+            // special-case: the up and gate tensors are merged into a single tensor
+            // TOOD: support into llm_build_ffn
+            {
+                struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+                cb(up, "ffn_up", il);
+
+                auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
+                auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
+
+                y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
+                cb(y, "ffn_gate", il);
+
+                auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
+                cb(down, "ffn_down", il);
+
+                cur = down;
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, residual, cur);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+            model.output_norm,
+            NULL,
+            LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);

         return gf;
     }

+
     struct ggml_cgraph * build_plamo() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);

@@ -10445,6 +10668,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_phi2();
             } break;
+        case LLM_ARCH_PHI3:
+            {
+                result = llm.build_phi3();
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 result = llm.build_plamo();
@@ -13268,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();

-    bool
+    bool allow_eog = false;
     for (const auto & stack : grammar->stacks) {
         if (stack.empty()) {
-
+            allow_eog = true;
             break;
         }
     }

-    const llama_token eos = llama_token_eos(&ctx->model);
-
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     candidates_decoded.reserve(candidates->size);
     std::vector<llama_grammar_candidate> candidates_grammar;
@@ -13285,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
-
-
+        const std::string piece = llama_token_to_piece(ctx, id, false);
+
+        if (llama_token_is_eog(&ctx->model, id)) {
+            if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
@@ -13450,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     return result;
 }

-llama_token
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
     GGML_ASSERT(ctx);

     const int64_t t_start_sample_us = ggml_time_us();
@@ -13463,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     }

     std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
     int idx = dist(rng);

     llama_token result = candidates->data[idx].id;
@@ -13473,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }

+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();

-    if (
+    if (llama_token_is_eog(&ctx->model, token)) {
         for (const auto & stack : grammar->stacks) {
             if (stack.empty()) {
                 return;
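`llama_sample_token` is now a thin wrapper that forwards the context's own RNG to the new `llama_sample_token_with_rng`. Because the `with_rng` overload takes a `std::mt19937 &`, it is not part of the plain C API; whether it is visible to your code (for example through llama.h's internal section) is an assumption in the sketch below:

```cpp
#include <random>
#include "llama.h"

// Sketch: reproducible sampling with a caller-owned RNG. Assumes `ctx` and a
// populated `candidates` array exist, and that llama_sample_token_with_rng is
// declared somewhere visible to this translation unit.
llama_token sample_reproducible(llama_context * ctx, llama_token_data_array * candidates, uint32_t seed) {
    std::mt19937 rng(seed);   // same seed -> same draw for the same candidate distribution
    return llama_sample_token_with_rng(ctx, candidates, rng);
}
```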
@@ -13485,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token, false);

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14308,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<uint8_t>> work;
     std::vector<no_init<float>> f32_conv_buf;

+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-
-
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
     }

-
-
-
-
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }

-
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }

-
-
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };

     const auto tn = LLM_TN(model.arch);
-
+    new_ofstream(0);
     for (int i = 0; i < ml.n_tensors; ++i) {
-
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }

         const std::string name = ggml_get_name(tensor);

@@ -14482,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;

         // update the gguf meta data as we go
-        gguf_set_tensor_type(
-        gguf_set_tensor_data(
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);

         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
-
-
-
-    fout.seekp(0);
-    std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-    gguf_get_meta_data(ctx_out, data.data());
-    fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c:ctx_outs) {
+        gguf_free(c);
     }

-    fout.close();
-
-    gguf_free(ctx_out);
-
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

@@ -14857,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor =*/ true,
         /*.only_copy =*/ false,
         /*.pure =*/ false,
+        /*.keep_split =*/ false,
         /*.imatrix =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
     };
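`keep_split` is a new field of `llama_model_quantize_params`; together with the quantize-internal changes above it lets a split (multi-file) GGUF be quantized shard-by-shard instead of being merged into a single output. A hedged usage sketch with the public API; the file names are placeholders:

```cpp
#include "llama.h"

// Sketch: quantize a split GGUF while preserving its shard layout.
// The input is assumed to be the first shard of a split model.
int quantize_keep_split() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
    params.keep_split = true;                      // write one output shard per input shard

    // With keep_split, the output name serves as the base for the generated shard names.
    return (int) llama_model_quantize("model-00001-of-00003.gguf", "model-q4_k_m.gguf", &params);
}
```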
@@ -15365,6 +15635,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -15378,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     return LLAMA_ROPE_TYPE_NONE;
 }

+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+    return ctx->cparams.pooling_type;
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->hparams.n_vocab;
 }
@@ -15856,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
 *
 */
 static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+    llama_synchronize(ctx);
+
     // copy rng
     {
         std::ostringstream rng_ss;
@@ -16008,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {

 // Sets the state reading from the specified source address
 size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
+    llama_synchronize(ctx);
+
     const uint8_t * inp = src;

     // set rng
@@ -16312,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
 }

 static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
     const auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16429,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
 }

 size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+    llama_synchronize(ctx);
+
     auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16880,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
     return model->vocab.id_to_token[token].type;
 }

+bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+    return token != -1 && (
+        token == llama_token_eos(model) ||
+        token == llama_token_eot(model)
+    );
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
@@ -16957,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
 }

 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
-        } else if (
+        } else if (
+                (llama_is_user_defined_token(model->vocab, token)) ||
+                (llama_is_control_token (model->vocab, token) && special)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (length < (int) result.length()) {
                 return -(int) result.length();
@@ -16985,8 +17277,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, "\xe2\x96\x85", 3);
             return 3;
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
         } else if (llama_is_byte_token(model->vocab, token)) {
             if (length < 1) {
                 return -1;
@@ -17007,15 +17297,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
-        } else if (
+        } else if (
+                (llama_is_user_defined_token(model->vocab, token)) ||
+                (llama_is_control_token (model->vocab, token) && special)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (length < (int) result.length()) {
                 return -(int) result.length();
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
         }
         break;
     }
@@ -17213,6 +17503,24 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
+    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+        // Llama 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+        }
+        if (add_ass) {
+            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+        }
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else {
         // template not supported
         return -1;
@@ -17345,6 +17653,11 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+    s += "LAMMAFILE = 1 | ";
+#else
+    s += "LAMMAFILE = 0 | ";
+#endif

     return s.c_str();
 }