cui-llama.rn 1.0.9 → 1.0.10
This diff reflects the changes between publicly released versions of the package, as published to a supported public registry, and is provided for informational purposes only.
- package/cpp/common.cpp +0 -6
- package/cpp/common.h +0 -4
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml.c +3 -3
- package/cpp/ggml.h +4 -2
- package/cpp/grammar-parser.cpp +3 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +4 -2
- package/cpp/llama-vocab.h +2 -2
- package/cpp/llama.cpp +34 -15
- package/cpp/llama.h +6 -10
- package/cpp/rn-llama.hpp +20 -6
- package/package.json +1 -1
package/cpp/common.cpp
CHANGED
@@ -2709,12 +2709,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }
 
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
package/cpp/common.h
CHANGED
@@ -392,10 +392,6 @@ std::string llama_detokenize(
        const std::vector<llama_token> & tokens,
                          bool special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
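Note: the helper above is dropped because `llama_add_bos_token()` itself now returns `bool` (see the `llama.h` and `llama.cpp` changes later in this diff), so the SPM fallback is no longer needed. A minimal migration sketch, assuming a loaded `llama_model *`; the wrapper name `should_prepend_bos` is illustrative only:

```cpp
#include "llama.h"

// Illustrative migration sketch: llama_should_add_bos_token() used to wrap an
// int32_t API (-1 = unknown) and fall back to the SPM vocab check; with this
// release the tokenizer metadata flag is exposed directly as a bool.
static bool should_prepend_bos(const llama_model * model) {
    return llama_add_bos_token(model); // previously: llama_should_add_bos_token(model)
}
```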
package/cpp/ggml-metal.m
CHANGED
@@ -310,7 +310,7 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
-    struct lm_ggml_backend_metal_context * ctx =
+    struct lm_ggml_backend_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb   = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
@@ -2313,7 +2313,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                 memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
                 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
-                const bool is_neox = mode & 2;
+                const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;
 
                 id<MTLComputePipelineState> pipeline = nil;
 
package/cpp/ggml.c
CHANGED
@@ -14094,7 +14094,7 @@ static void lm_ggml_compute_forward_rope_f32(
     float corr_dims[2];
     lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -14219,7 +14219,7 @@ static void lm_ggml_compute_forward_rope_f16(
     float corr_dims[2];
     lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -21129,7 +21129,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
            (int64_t) info->ne[2] *
            (int64_t) info->ne[3];
 
-        if (ne % lm_ggml_blck_size(info->type) != 0) {
+        if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) {
            fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                    __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
            fclose(file);
package/cpp/ggml.h
CHANGED
@@ -244,6 +244,8 @@
 #define LM_GGML_EXIT_SUCCESS 0
 #define LM_GGML_EXIT_ABORTED 1
 
+#define LM_GGML_ROPE_TYPE_NEOX 2
+
 #define LM_GGUF_MAGIC "GGUF"
 
 #define LM_GGUF_VERSION 3
@@ -1453,8 +1455,8 @@ extern "C" {
             struct lm_ggml_tensor * b);
 
     // rotary position embedding
-    // if mode & 1
-    // if mode &
+    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+    // if (mode & LM_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
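With `LM_GGML_ROPE_TYPE_NEOX` defined once in `ggml.h`, the CPU and Metal backends (and `llama.h` below) can test the rope mode against a named bit instead of the bare literal `2`. A tiny sketch of the intended check; `rope_is_neox` is an illustrative helper, not part of the API:

```cpp
#include <cstdio>

#define LM_GGML_ROPE_TYPE_NEOX 2 // mirrors the new define in ggml.h

// The mode passed to lm_ggml_rope() is a bit field; NeoX-style rotation is
// selected by this bit rather than by a magic constant.
static bool rope_is_neox(int mode) {
    return (mode & LM_GGML_ROPE_TYPE_NEOX) != 0;
}

int main() {
    std::printf("%d %d\n", rope_is_neox(0), rope_is_neox(LM_GGML_ROPE_TYPE_NEOX)); // 0 1
}
```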
package/cpp/grammar-parser.cpp
CHANGED
@@ -369,6 +369,9 @@ namespace grammar_parser {
         }
         // Validate the state to ensure that all rules are defined
         for (const auto & rule : state.rules) {
+            if (rule.empty()) {
+                throw std::runtime_error("Undefined rule");
+            }
             for (const auto & elem : rule) {
                 if (elem.type == LLAMA_GRETYPE_RULE_REF) {
                     // Ensure that the rule at that location exists
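The added check rejects grammars that reference a rule which is never defined (such a rule parses to an empty body). A hedged usage sketch, assuming the header is available as "grammar-parser.h" and that this package's `grammar_parser::parse()` behaves like upstream llama.cpp's, catching parser exceptions internally and returning an empty `parse_state`:

```cpp
#include <cstdio>
#include "grammar-parser.h"

int main() {
    // "item" is referenced but never defined, so its rule stays empty and the
    // new validation throws "Undefined rule" inside parse().
    const char * gbnf = "root ::= item+\n";
    grammar_parser::parse_state state = grammar_parser::parse(gbnf);
    if (state.rules.empty()) {
        std::printf("grammar rejected\n");
    }
}
```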
package/cpp/llama-sampling.cpp
CHANGED
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
         constexpr float bucket_low = -10.0f;
         constexpr float bucket_high = 10.0f;
         constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-        constexpr float
+        constexpr float bucket_inter = -bucket_low * bucket_scale;
 
         std::vector<int> bucket_idx(candidates->size);
         std::vector<int> histo(nbuckets, 0);
 
         for (int i = 0; i < (int)candidates->size; ++i) {
             const float val = candidates->data[i].logit;
-            int ib = int(bucket_scale * val +
+            int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
             ib = std::max(0, std::min(nbuckets-1, ib));
             bucket_idx[i] = ib;
             ++histo[ib];
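For reference, the bucket index computed above is an affine map of the logit onto the bucket range, clamped to the valid indices. A worked sketch with assumed constants (128 buckets over [-10, 10]; the real `nbuckets` is defined earlier in the function and is not shown in this hunk):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    constexpr int   nbuckets     = 128;                                  // assumed value
    constexpr float bucket_low   = -10.0f;
    constexpr float bucket_high  =  10.0f;
    constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);  // 6.4
    constexpr float bucket_inter = -bucket_low * bucket_scale;           // 64.0

    const float logit = 3.25f;
    int ib = int(bucket_scale * logit + bucket_inter); // int(20.8 + 64.0) = 84
    ib = std::max(0, std::min(nbuckets - 1, ib));       // clamp to [0, 127]
    std::printf("bucket index: %d\n", ib);
}
```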
package/cpp/llama-vocab.cpp
CHANGED
@@ -410,6 +410,8 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_PORO:
+            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                 regex_exprs = {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
@@ -1466,11 +1468,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
     return vocab.special_pad_id;
 }
 
-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_bos;
 }
 
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_eos;
 }
 
package/cpp/llama-vocab.h
CHANGED
@@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
 
-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
 
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
package/cpp/llama.cpp
CHANGED
@@ -3586,13 +3586,8 @@ namespace GGUFMeta {
 
 using llama_buf_map = std::unordered_map<uint32_t, lm_ggml_backend_buffer_t>;
 
-
-
-    //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
-    //    return 32768;
-    //}
-
-    return 8192;
+static size_t llama_model_max_nodes(const llama_model & model) {
+    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
 }
 
 struct llama_model_loader {
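The graph-node budget is now derived from the model instead of being a fixed 8192, which keeps models with very many tensors from overflowing the compute graph. A quick sketch of the arithmetic, with tensor counts chosen for illustration only:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// Same formula as the hunk above: at least 8192 nodes, or 5 nodes per tensor
// once a model has more than ~1638 tensors.
static size_t max_nodes_for(size_t n_tensors) {
    return std::max<size_t>(8192, n_tensors * 5);
}

int main() {
    std::printf("%zu\n", max_nodes_for(291));  // small model -> 8192
    std::printf("%zu\n", max_nodes_for(3000)); // huge model  -> 15000
}
```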
@@ -4912,7 +4907,6 @@ static void llm_load_hparams(
            } break;
        case LLM_ARCH_PHI3:
            {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                switch (hparams.n_layer) {
@@ -4921,6 +4915,22 @@ static void llm_load_hparams(
                    case 40: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
+
+                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    hparams.n_swa = 2047;
+                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-mini-128k-instruct
+                    hparams.n_swa = 262144;
+                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-medium-128k-instruct
+                    hparams.n_swa = 131072;
+                }
+                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (!found_swa && hparams.n_swa == 0) {
+                    throw std::runtime_error("invalid value for sliding_window");
+                }
            } break;
        case LLM_ARCH_PLAMO:
            {
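The hunk above keeps older Phi-3 GGUFs loadable: when the `sliding_window` key is absent, `n_swa` is inferred from the layer count, KV-head count and training context length before the (now optional) key lookup. A simplified sketch of that selection, with the values copied from the hunk; the helper name is illustrative:

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>

// Simplified version of the fallback: return a default sliding-window size for
// known Phi-3 configurations, otherwise require an explicit sliding_window key.
static uint32_t phi3_default_n_swa(uint32_t n_layer, uint32_t n_head_kv, uint32_t n_ctx_train) {
    if ((n_layer == 32 || n_layer == 40) && n_ctx_train == 4096)   return 2047;   // mini/medium 4k
    if (n_layer == 32 && n_head_kv == 32 && n_ctx_train == 131072) return 262144; // mini 128k
    if (n_layer == 40 && n_ctx_train == 131072)                    return 131072; // medium 128k
    throw std::runtime_error("invalid value for sliding_window");
}

int main() {
    std::printf("%u\n", phi3_default_n_swa(32, 32, 4096)); // 2047
}
```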
@@ -5468,6 +5478,12 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "codeshell") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                tokenizer_pre == "bloom") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
+                tokenizer_pre == "gpt3-finnish") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -14718,12 +14734,15 @@ static int llama_decode_internal(
        res  = nullptr;
        embd = nullptr;
    } else if (cparams.embeddings) {
-        res
-        embd =
-
-
+        res  = nullptr; // do not extract logits for embedding case
+        embd = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+                embd = gf->nodes[i];
+                break;
+            }
        }
-        LM_GGML_ASSERT(
+        LM_GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
    } else {
        embd = nullptr; // do not extract embeddings when not needed
        LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -18697,11 +18716,11 @@ llama_token llama_token_pad(const struct llama_model * model) {
    return llama_token_pad_impl(model->vocab);
 }
 
-int32_t llama_add_bos_token(const struct llama_model * model) {
+bool llama_add_bos_token(const struct llama_model * model) {
    return llama_add_bos_token_impl(model->vocab);
 }
 
-int32_t llama_add_eos_token(const struct llama_model * model) {
+bool llama_add_eos_token(const struct llama_model * model) {
    return llama_add_eos_token_impl(model->vocab);
 }
 
package/cpp/llama.h
CHANGED
@@ -93,15 +93,14 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_TEKKEN       = 20,
        LLAMA_VOCAB_PRE_TYPE_SMOLLM       = 21,
        LLAMA_VOCAB_PRE_TYPE_CODESHELL    = 22,
+        LLAMA_VOCAB_PRE_TYPE_BLOOM        = 23,
+        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
    };
 
-    // note: these values should be synchronized with lm_ggml_rope
-    // TODO: maybe move this enum to ggml.h (lm_ggml_rope_type)
    enum llama_rope_type {
        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM = 0,
-        LLAMA_ROPE_TYPE_NEOX = 2,
-        LLAMA_ROPE_TYPE_GLM  = 4,
+        LLAMA_ROPE_TYPE_NORM = 0,
+        LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX,
    };
 
    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -915,11 +914,8 @@ extern "C" {
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
    // Codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
package/cpp/rn-llama.hpp
CHANGED
@@ -297,7 +297,9 @@ struct llama_rn_context
        }
 
        // do Context Shift , may be buggy! TODO: Verify functionality
-
+        if(!params.embedding){
+            purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+        }
 
        // push the prompt into the sampling context (do not apply grammar)
        for (auto & token : prompt_tokens)
@@ -305,7 +307,7 @@ struct llama_rn_context
            llama_sampling_accept(ctx_sampling, ctx, token, false);
        }
        // compare the evaluated prompt with the new prompt
-        n_past = common_part(embd, prompt_tokens);
+        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
        LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
        LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
        LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
@@ -342,9 +344,9 @@ struct llama_rn_context
        completion_token_output result;
        result.tok = -1;
 
+        // this truncation should never trigger with good context shifting
        if (embd.size() >= (size_t)params.n_ctx)
        {
-            // Shift context
 
            const int n_left = n_past - params.n_keep - 1;
            const int n_discard = n_left/2;
@@ -546,9 +548,21 @@ struct llama_rn_context
            LOG_WARNING("embedding disabled, embedding: %s", params.embedding);
            return std::vector<float>(n_embd, 0.0f);
        }
-
-
-
+        float *data;
+
+        if(params.pooling_type == 0){
+            data = llama_get_embeddings(ctx);
+        }
+        else {
+            data = llama_get_embeddings_seq(ctx, 0);
+        }
+
+        if(!data) {
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
+        llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+        return out;
    }
 
    std::string bench(int pp, int tg, int pl, int nr)