llama_cpp 0.14.6 → 0.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +11 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -293
- data/vendor/tmp/llama.cpp/ggml.c +3 -17
- data/vendor/tmp/llama.cpp/llama.cpp +379 -66
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +404 -553
- metadata +2 -2
@@ -211,6 +211,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -793,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -1600,12 +1619,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
@@ -2120,7 +2139,7 @@ struct llama_vocab {
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
-    id special_eot_id = -1;
+    id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

     bool add_space_prefix = true;

@@ -2980,9 +2999,13 @@ struct llama_model_loader {

         ggml_tensor * tensor;

-        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+            }
         }
     };
     std::vector<llama_tensor_weight> weights;
@@ -3021,15 +3044,15 @@ struct llama_model_loader {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(0, cur->name, meta, cur);
+            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
         }
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-        contexts.emplace_back(ctx);
-
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -3063,12 +3086,13 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
             }

+            files.emplace_back(new llama_file(split_path, "rb"));
+            contexts.emplace_back(ctx);
+
             // Save tensors data offset info of the shard.
             for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
             }
-            files.emplace_back(new llama_file(split_path, "rb"));
-            contexts.emplace_back(ctx);

             gguf_free(ctx_gguf);
         }
@@ -3278,6 +3302,10 @@ struct llama_model_loader {
         return nullptr;
     }

+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -3770,7 +3798,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -3955,6 +3983,16 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -4179,7 +4217,10 @@ static void llm_load_vocab(
                 vocab.special_prefix_id = 67;
                 vocab.special_suffix_id = 69;
                 vocab.special_middle_id = 68;
-
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id = 70;
+                vocab.special_eot_id = 107;
             }
         }

@@ -4308,6 +4349,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
         };
+
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
             int32_t & id = std::get<1>(it);
@@ -4322,7 +4364,6 @@ static void llm_load_vocab(
             } else {
                 id = new_id;
             }
-
         }

         // Handle add_bos_token and add_eos_token
@@ -4336,6 +4377,28 @@ static void llm_load_vocab(
                 vocab.special_add_eos = int(temp);
             }
         }
+
+        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+        //       for now, we apply this workaround to find the EOT token based on its text
+        if (vocab.special_eot_id == -1) {
+            for (const auto & t : vocab.token_to_id) {
+                if (
+                        // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+                        //       need to fix convert script
+                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+                        (t.first == "<|eot_id|>" ||
+                         t.first == "<|im_end|>" ||
+                         t.first == "<|end|>" ||
+                         t.first == "<end_of_turn>"
+                        )
+                   ) {
+                    vocab.special_eot_id = t.second;
+                    break;
+                }
+            }
+        }
     }

     // build special tokens cache
@@ -4498,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

     // special tokens
-    if (vocab.special_bos_id
-    if (vocab.special_eos_id
-    if (vocab.special_unk_id
-    if (vocab.special_sep_id
-    if (vocab.special_pad_id
-    if (vocab.special_cls_id
-    if (vocab.special_mask_id
-
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
 }

 // Returns false if cancelled by progress_callback
@@ -5346,6 +5414,33 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_PHI3:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context* ctx_layer = ctx_for_layer(i);
+                        ggml_context* ctx_split = ctx_for_layer_split(i);
+
+                        auto& layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6297,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

-        if (model.arch == LLM_ARCH_PHI2) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -8938,12 +9033,140 @@ struct llm_build_context {

         cur = ggml_add(ctx0, cur, model.output_b);
         cb(cur, "result_output", -1);
+        ggml_build_forward_expand(gf, cur);
+        return gf;
+    }
+
+    struct ggml_cgraph * build_phi3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            auto residual = inpL;
+
+            // self-attention
+            {
+                struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    NULL,
+                    LLM_NORM_RMS, cb, il);
+                cb(attn_norm_output, "attn_norm", il);
+
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv) {
+                    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                    cb(cur, "wqkv", il);
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+                }
+                else {
+                    Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                    model.layers[il].wo, NULL,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            }
+
+            cur = ggml_add(ctx0, cur, residual);
+            residual = cur;
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // FF
+            // special-case: the up and gate tensors are merged into a single tensor
+            // TOOD: support into llm_build_ffn
+            {
+                struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+                cb(up, "ffn_up", il);
+
+                auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
+                auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
+
+                y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
+                cb(y, "ffn_gate", il);
+
+                auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
+                cb(down, "ffn_down", il);
+
+                cur = down;
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, residual, cur);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+            model.output_norm,
+            NULL,
+            LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);

         return gf;
     }

+
     struct ggml_cgraph * build_plamo() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);

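The FFN block in build_phi3() splits the single merged ffn_up tensor into a gate half and an up half and applies SiLU gating. A minimal scalar sketch of that computation for one token (plain C++, not ggml; the function name and layout assumptions are illustrative):

    #include <cmath>
    #include <vector>

    // `up` holds 2*n_ff values for one token: first half is the gate, second half
    // is the up projection, matching the two ggml_view_2d halves above.
    // The result is up_half * silu(gate_half), i.e. ggml_mul(y, ggml_silu(g)).
    static std::vector<float> swiglu_from_merged(const std::vector<float> & up, int n_ff) {
        std::vector<float> out(n_ff);
        for (int i = 0; i < n_ff; ++i) {
            const float g = up[i];          // gate half (view at byte offset 0)
            const float y = up[n_ff + i];   // up half (view at byte offset nb[1]/2)
            out[i] = y * (g / (1.0f + std::exp(-g)));  // y * silu(g)
        }
        return out;
    }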
@@ -10445,6 +10668,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_phi2();
             } break;
+        case LLM_ARCH_PHI3:
+            {
+                result = llm.build_phi3();
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 result = llm.build_plamo();
@@ -13268,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();

-    bool
+    bool allow_eog = false;
     for (const auto & stack : grammar->stacks) {
         if (stack.empty()) {
-
+            allow_eog = true;
             break;
         }
     }

-    const llama_token eos = llama_token_eos(&ctx->model);
-
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     candidates_decoded.reserve(candidates->size);
     std::vector<llama_grammar_candidate> candidates_grammar;
@@ -13285,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
-
-
+        const std::string piece = llama_token_to_piece(ctx, id, false);
+
+        if (llama_token_is_eog(&ctx->model, id)) {
+            if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
@@ -13450,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     return result;
 }

-llama_token
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
     GGML_ASSERT(ctx);

     const int64_t t_start_sample_us = ggml_time_us();
@@ -13463,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     }

     std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
     int idx = dist(rng);

     llama_token result = candidates->data[idx].id;
@@ -13473,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }

+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();

-    if (
+    if (llama_token_is_eog(&ctx->model, token)) {
         for (const auto & stack : grammar->stacks) {
             if (stack.empty()) {
                 return;
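With llama_sample_token now delegating to llama_sample_token_with_rng, a caller can supply its own generator and make sampling reproducible, while llama_sample_token keeps the old behaviour by passing the context RNG. A minimal caller-side sketch, assuming the new entry point's declaration is visible to the caller in this version's headers (treat that visibility as an assumption):

    #include <random>
    #include "llama.h"

    // Sketch only: seeds a caller-owned std::mt19937 so repeated runs with the
    // same seed and the same candidate array pick the same token.
    static llama_token sample_with_seed(llama_context * ctx, llama_token_data_array * candidates, uint32_t seed) {
        std::mt19937 rng(seed);
        return llama_sample_token_with_rng(ctx, candidates, rng);
    }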
@@ -13485,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token, false);

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14308,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<uint8_t>> work;
     std::vector<no_init<float>> f32_conv_buf;

+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-
-
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
     }

-
-
-
-
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }

-
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }

-
-
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };

     const auto tn = LLM_TN(model.arch);
-
+    new_ofstream(0);
     for (int i = 0; i < ml.n_tensors; ++i) {
-
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }

         const std::string name = ggml_get_name(tensor);

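The new keep_split path writes one output shard per input shard instead of merging everything into a single GGUF. A minimal caller-side sketch of the quantize API with the new flag (the file names are hypothetical):

    #include "llama.h"

    static uint32_t quantize_keeping_shards() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.keep_split = true; // one output shard per input shard, named via llama_split_path()

        // returns 0 on success; input/output names here are illustrative only
        return llama_model_quantize("model-00001-of-00003.gguf", "model-q4_k_m.gguf", &params);
    }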
@@ -14482,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;

         // update the gguf meta data as we go
-        gguf_set_tensor_type(
-        gguf_set_tensor_data(
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);

         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
-
-
-
-    fout.seekp(0);
-    std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-    gguf_get_meta_data(ctx_out, data.data());
-    fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c:ctx_outs) {
+        gguf_free(c);
     }

-    fout.close();
-
-    gguf_free(ctx_out);
-
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

|
@@ -14857,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
14857
15126
|
/*.quantize_output_tensor =*/ true,
|
14858
15127
|
/*.only_copy =*/ false,
|
14859
15128
|
/*.pure =*/ false,
|
15129
|
+
/*.keep_split =*/ false,
|
14860
15130
|
/*.imatrix =*/ nullptr,
|
14861
15131
|
/*.kv_overrides =*/ nullptr,
|
14862
15132
|
};
|
@@ -15365,6 +15635,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -15378,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     return LLAMA_ROPE_TYPE_NONE;
 }

+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+    return ctx->cparams.pooling_type;
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->hparams.n_vocab;
 }
@@ -15856,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
  *
  */
 static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+    llama_synchronize(ctx);
+
     // copy rng
     {
         std::ostringstream rng_ss;
@@ -16008,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {

 // Sets the state reading from the specified source address
 size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
+    llama_synchronize(ctx);
+
     const uint8_t * inp = src;

     // set rng
@@ -16312,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
 }

 static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
     const auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16429,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
 }

 size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+    llama_synchronize(ctx);
+
     auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented

@@ -16880,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
     return model->vocab.id_to_token[token].type;
 }

+bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+    return token != -1 && (
+        token == llama_token_eos(model) ||
+        token == llama_token_eot(model)
+    );
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
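llama_token_is_eog gives callers a single check for any end-of-generation token (EOS or EOT such as <|eot_id|> / <|im_end|>). A minimal sketch of how a generation loop would use it instead of comparing against llama_token_eos() alone (the helper name is illustrative):

    #include "llama.h"

    // Sketch only: stop condition for a sampling loop.
    static bool should_stop(const llama_model * model, llama_token new_token) {
        return llama_token_is_eog(model, new_token);
    }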
@@ -16957,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
 }

 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
-        } else if (
+        } else if (
+                (llama_is_user_defined_token(model->vocab, token)) ||
+                (llama_is_control_token (model->vocab, token) && special)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (length < (int) result.length()) {
                 return -(int) result.length();
@@ -16985,8 +17277,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, "\xe2\x96\x85", 3);
             return 3;
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
         } else if (llama_is_byte_token(model->vocab, token)) {
             if (length < 1) {
                 return -1;
|
|
17007
17297
|
}
|
17008
17298
|
memcpy(buf, result.c_str(), result.length());
|
17009
17299
|
return result.length();
|
17010
|
-
} else if (
|
17300
|
+
} else if (
|
17301
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
17302
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
17011
17303
|
std::string result = model->vocab.id_to_token[token].text;
|
17012
17304
|
if (length < (int) result.length()) {
|
17013
17305
|
return -(int) result.length();
|
17014
17306
|
}
|
17015
17307
|
memcpy(buf, result.c_str(), result.length());
|
17016
17308
|
return result.length();
|
17017
|
-
} else if (llama_is_control_token(model->vocab, token)) {
|
17018
|
-
;
|
17019
17309
|
}
|
17020
17310
|
break;
|
17021
17311
|
}
|
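With the extra `special` flag, control tokens are rendered as their text only when the caller asks for it, while user-defined tokens are always rendered. A minimal caller-side sketch against the updated public signature (the wrapper name is illustrative):

    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string token_to_piece(const llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8, 0);
        int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        if (n < 0) {
            buf.resize(-n); // a negative return is the required buffer size
            n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        }
        return std::string(buf.data(), n);
    }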
@@ -17213,6 +17503,24 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
+    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+        // Llama 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+        }
+        if (add_ass) {
+            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+        }
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else {
         // template not supported
         return -1;
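The two new branches make llama_chat_apply_template recognise the Llama 3 and Phi 3 formats, either by the explicit names "llama3"/"phi3" or by sniffing their marker tokens. A minimal usage sketch for the Phi 3 case (buffer size and message content are illustrative):

    #include <string>
    #include <vector>
    #include "llama.h"

    // For a single user message "Hello" with add_ass = true, the expected result is:
    //   <|user|>\nHello<|end|>\n<|assistant|>\n
    static std::string format_phi3_prompt() {
        const llama_chat_message msgs[] = { { "user", "Hello" } };
        std::vector<char> buf(256);
        const int32_t n = llama_chat_apply_template(
            /*model   =*/ nullptr,  // with an explicit tmpl the model is not consulted
            /*tmpl    =*/ "phi3",
            msgs, 1,
            /*add_ass =*/ true,
            buf.data(), (int32_t) buf.size());
        return (n >= 0 && n <= (int32_t) buf.size()) ? std::string(buf.data(), (size_t) n) : std::string();
    }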
@@ -17345,6 +17653,11 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+    s += "LAMMAFILE = 1 | ";
+#else
+    s += "LAMMAFILE = 0 | ";
+#endif

     return s.c_str();
 }