llama_cpp 0.8.0 → 0.9.0
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +81 -162
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +13 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/ggml.c +362 -84
- data/ext/llama_cpp/src/ggml.h +8 -7
- data/ext/llama_cpp/src/llama.cpp +100 -95
- data/ext/llama_cpp/src/llama.h +16 -21
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +11 -12
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -401,15 +401,16 @@ extern "C" {
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
-        GGML_OP_CONV_2D,
+        GGML_OP_CONV_1D_STAGE_0, // internal
+        GGML_OP_CONV_1D_STAGE_1, // internal
         GGML_OP_CONV_TRANSPOSE_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_CONV_2D_STAGE_0, // internal
+        GGML_OP_CONV_2D_STAGE_1, // internal
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
-        GGML_OP_CONV_1D_STAGE_0, // internal
-        GGML_OP_CONV_1D_STAGE_1, // internal
-
         GGML_OP_UPSCALE, // nearest interpolate
 
         GGML_OP_FLASH_ATTN,

@@ -1020,9 +1021,9 @@ extern "C" {
             struct ggml_tensor * b,
             float eps);
 
-    // A:
-    // B:
-    // result is
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
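The rewritten ggml_mul_mat comment above is easier to check against a concrete call. Below is a minimal sketch of the shape convention it describes (in ggml, ne[0] counts columns); the context size and tensor dimensions are illustrative assumptions, not values from this diff.

#include "ggml.h"

// A: k columns, n rows;  B: k columns, m rows;  result: n columns, m rows.
int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,  // illustrative scratch size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int k = 64, n = 32, m = 8;
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n); // k columns, n rows
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, m); // k columns, m rows
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);                      // c->ne[0] == n, c->ne[1] == m

    ggml_free(ctx);
    return 0;
}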
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -970,14 +970,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
-    }
+    }
+    else {
         result.resize(n_tokens);
     }
 

@@ -1013,8 +1014,8 @@ enum e_model {
 };
 
 static const size_t kB = 1024;
-static const size_t MB =
-static const size_t GB =
+static const size_t MB = 1024*kB;
+static const size_t GB = 1024*MB;
 
 struct llama_hparams {
     bool vocab_only;

@@ -1037,21 +1038,21 @@ struct llama_hparams {
     float f_max_alibi_bias;
 
     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only
-        if (this->n_vocab
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
         if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd
-        if (this->n_head
-        if (this->n_head_kv
-        if (this->n_layer
-        if (this->n_rot
-        if (this->n_ff
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
 
         const float EPSILON = 1e-9;
 
-        if (!is_float_close(this->f_norm_eps,
-        if (!is_float_close(this->f_norm_rms_eps,
-        if (!is_float_close(this->rope_freq_base_train,
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
         return false;

@@ -1190,17 +1191,17 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
-    id linefeed_id
+    id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
     id special_suffix_id = 32008;
-    id special_eot_id
+    id special_eot_id = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-
-
-
-
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {

@@ -1354,10 +1355,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
-    // change it and test that it works
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;

@@ -2236,15 +2234,35 @@ static void llm_load_vocab(
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        vocab.linefeed_id = ids[0];
     }
 
     // special tokens
-
-
-
-
-
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+        };
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it), old_id = id;
+
+            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
+            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
+            // can only come from the default value, so there's no point in
+            // validating that.
+            if (size_t(id + 1) > vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
+                    __func__, key.c_str(), id, old_id);
+                id = old_id;
+            }
+        }
+    }
 
     // build special tokens cache
     {

@@ -6101,11 +6119,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
     case LLAMA_VOCAB_TYPE_SPM: {
-        char buf[7];
-        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
-        GGML_ASSERT(0 <= result && result < 7);
+        const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
         return vocab.token_to_id.at(buf);
     }
     case LLAMA_VOCAB_TYPE_BPE: {

@@ -7412,37 +7429,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     llama_sample_temp(ctx, candidates_p, temp);
 }
 
-void
-
-
-
-
-
-
-
-
-        if (token_iter == last_tokens + last_tokens_size) {
-            continue;
-        }
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty;
-        } else {
-            candidates->data[i].logit /= penalty;
-        }
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
-    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+void llama_sample_repetition_penalties(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+               const llama_token * last_tokens,
+                            size_t penalty_last_n,
+                             float penalty_repeat,
+                             float penalty_freq,
+                             float penalty_present) {
+    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
         return;
     }
 

@@ -7450,19 +7445,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
 
     // Create a frequency map to count occurrences of each token in last_tokens
     std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i <
-        token_count[
+    for (size_t i = 0; i < penalty_last_n; ++i) {
+        token_count[last_tokens[i]]++;
     }
 
     // Apply frequency and presence penalties to the candidates
     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = token_count.find(candidates->data[i].id);
+        const auto token_iter = token_count.find(candidates->data[i].id);
         if (token_iter == token_count.end()) {
             continue;
         }
 
-        int count = token_iter->second;
-
+        const int count = token_iter->second;
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty_repeat;
+        } else {
+            candidates->data[i].logit /= penalty_repeat;
+        }
+
+        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
     }
 
     candidates->sorted = false;

@@ -7484,14 +7488,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    const llama_token eos = llama_token_eos(ctx);
+    const llama_token eos = llama_token_eos(&ctx->model);
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     std::vector<llama_grammar_candidate> candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece =
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;

@@ -7694,7 +7698,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();
 
-    if (token == llama_token_eos(ctx)) {
+    if (token == llama_token_eos(&ctx->model)) {
         for (const auto & stack : grammar->stacks) {
             if (stack.empty()) {
                 return;

@@ -7703,7 +7707,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     GGML_ASSERT(false);
 }
 
-    const std::string piece =
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);

@@ -8903,7 +8907,7 @@ struct llama_context * llama_new_context_with_model(
     // build worst-case graph
     int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
     int n_past = cparams.n_ctx - n_tokens;
-    llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+    llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
     ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
 
 #ifdef GGML_USE_METAL

@@ -9664,43 +9668,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_get_text(const struct
-    return
+const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].text.c_str();
 }
 
-float llama_token_get_score(const struct
-    return
+float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].score;
 }
 
-llama_token_type llama_token_get_type(const struct
-    return
+llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].type;
 }
 
-llama_token llama_token_bos(const struct
-    return
+llama_token llama_token_bos(const struct llama_model * model) {
+    return model->vocab.special_bos_id;
 }
 
-llama_token llama_token_eos(const struct
-    return
+llama_token llama_token_eos(const struct llama_model * model) {
+    return model->vocab.special_eos_id;
 }
 
-llama_token llama_token_nl(const struct
-    return
+llama_token llama_token_nl(const struct llama_model * model) {
+    return model->vocab.linefeed_id;
 }
-
-
+
+llama_token llama_token_prefix(const struct llama_model * model) {
+    return model->vocab.special_prefix_id;
 }
 
-llama_token llama_token_middle(const struct
-    return
+llama_token llama_token_middle(const struct llama_model * model) {
+    return model->vocab.special_middle_id;
 }
 
-llama_token llama_token_suffix(const struct
-    return
+llama_token llama_token_suffix(const struct llama_model * model) {
+    return model->vocab.special_suffix_id;
 }
 
-llama_token llama_token_eot(const struct
-    return
+llama_token llama_token_eot(const struct llama_model * model) {
+    return model->vocab.special_eot_id;
 }
 
 int llama_tokenize(
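The sampling hunks above fold llama_sample_repetition_penalty and llama_sample_frequency_and_presence_penalties into a single llama_sample_repetition_penalties that applies all three penalties in one pass. Below is a hedged sketch of a call site under the new signature; the helper name and penalty values are illustrative assumptions, not taken from this diff.

#include "llama.h"

// Illustrative helper: applies repetition, frequency and presence penalties with one call.
// candidates and last_tokens are assumed to be prepared by the caller as before.
void apply_penalties(struct llama_context * ctx,
                     llama_token_data_array * candidates,
                     const llama_token * last_tokens,
                     size_t n_last) {
    // penalty_repeat == 1.0f combined with zero freq/present penalties is a no-op.
    llama_sample_repetition_penalties(ctx, candidates, last_tokens,
                                      /*penalty_last_n  =*/ n_last,
                                      /*penalty_repeat  =*/ 1.1f,
                                      /*penalty_freq    =*/ 0.0f,
                                      /*penalty_present =*/ 0.0f);
}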
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -494,21 +494,22 @@ extern "C" {
     // Vocab
     //
 
-    LLAMA_API const char * llama_token_get_text(const struct
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
 
-    LLAMA_API float llama_token_get_score(const struct
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct
-    LLAMA_API llama_token llama_token_eos(const struct
-    LLAMA_API llama_token llama_token_nl (const struct
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct
-    LLAMA_API llama_token llama_token_middle(const struct
-    LLAMA_API llama_token llama_token_suffix(const struct
-    LLAMA_API llama_token llama_token_eot (const struct
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
 
     //
     // Tokenization

@@ -560,21 +561,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                            size_t last_tokens_size,
-                             float penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
               const llama_token * last_tokens,
-                            size_t
-                             float
-                             float
+                            size_t penalty_last_n,
+                             float penalty_repeat,
+                             float penalty_freq,
+                             float penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
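As the header hunks above show, the token accessors now take a llama_model rather than a llama_context, so callers that only hold a context can go through llama_get_model, which this llama.h already provides. A minimal sketch; the helper name is an illustrative assumption:

#include "llama.h"

// Returns true when the sampled token is the model's end-of-sentence token.
bool is_end_of_generation(const struct llama_context * ctx, llama_token id) {
    const struct llama_model * model = llama_get_model(ctx);
    return id == llama_token_eos(model);
}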
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.8.0'
+  VERSION = '0.9.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1429'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp
 
       # apply penalties
       last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.
-
-
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
       )
 
       # temperature sampling

@@ -97,7 +97,7 @@ module LLaMACpp
 
       embd.each { |token| output << context.model.token_to_piece(token) }
 
-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end
 
     output.join.scrub('?').strip.delete_prefix(prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -82,6 +82,16 @@ module LLaMACpp
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end
 
   class Timings

@@ -143,16 +153,6 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void

@@ -170,8 +170,7 @@ module LLaMACpp
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-
+date: 2023-10-28 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: