llama_cpp 0.8.0 → 0.9.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +81 -162
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +13 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/ggml.c +362 -84
- data/ext/llama_cpp/src/ggml.h +8 -7
- data/ext/llama_cpp/src/llama.cpp +100 -95
- data/ext/llama_cpp/src/llama.h +16 -21
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +11 -12
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -401,15 +401,16 @@ extern "C" {
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
-
+        GGML_OP_CONV_1D_STAGE_0,  // internal
+        GGML_OP_CONV_1D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_CONV_2D_STAGE_0,  // internal
+        GGML_OP_CONV_2D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
-        GGML_OP_CONV_1D_STAGE_0,  // internal
-        GGML_OP_CONV_1D_STAGE_1,  // internal
-
         GGML_OP_UPSCALE, // nearest interpolate
 
         GGML_OP_FLASH_ATTN,
@@ -1020,9 +1021,9 @@ extern "C" {
             struct ggml_tensor  * b,
             float                 eps);
 
-    // A:
-    // B:
-    // result is
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
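The new comment spells out ggml's `ne` layout for `ggml_mul_mat`. A tiny Ruby sketch of that shape rule, for illustration only (`mul_mat_result_shape` is not part of the gem; ggml itself is not exposed at this level):

```ruby
# ne is stored as [ne0, ne1, ne2, ne3]. Per the comment above, A is [k, n, ...],
# B is [k, m, ...] (transposed internally), and the result is [n, m, ...] with
# the trailing dims taken from B.
def mul_mat_result_shape(a_ne, b_ne)
  raise ArgumentError, 'inner dimension (ne0) must match' unless a_ne[0] == b_ne[0]

  [a_ne[1], b_ne[1], b_ne[2], b_ne[3]]
end

p mul_mat_result_shape([4096, 4096, 1, 1], [4096, 8, 1, 1]) # => [4096, 8, 1, 1]
```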
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -970,14 +970,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
-    }
+    }
+    else {
         result.resize(n_tokens);
     }
 
@@ -1013,8 +1014,8 @@ enum e_model {
 };
 
 static const size_t kB = 1024;
-static const size_t MB =
-static const size_t GB =
+static const size_t MB = 1024*kB;
+static const size_t GB = 1024*MB;
 
 struct llama_hparams {
     bool vocab_only;
@@ -1037,21 +1038,21 @@ struct llama_hparams {
     float f_max_alibi_bias;
 
     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only
-        if (this->n_vocab
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
         if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd
-        if (this->n_head
-        if (this->n_head_kv
-        if (this->n_layer
-        if (this->n_rot
-        if (this->n_ff
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
 
         const float EPSILON = 1e-9;
 
-        if (!is_float_close(this->f_norm_eps,
-        if (!is_float_close(this->f_norm_rms_eps,
-        if (!is_float_close(this->rope_freq_base_train,
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
         return false;
@@ -1190,17 +1191,17 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
-    id linefeed_id
+    id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
     id special_suffix_id = 32008;
-    id special_eot_id
+    id special_eot_id = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-
-
-
-
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1354,10 +1355,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
-    // change it and test that it works
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
@@ -2236,15 +2234,35 @@ static void llm_load_vocab(
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        vocab.linefeed_id = ids[0];
     }
 
     // special tokens
-
-
-
-
-
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+        };
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it), old_id = id;
+
+            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
+            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
+            // can only come from the default value, so there's no point in
+            // validating that.
+            if (size_t(id + 1) > vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
+                    __func__, key.c_str(), id, old_id);
+                id = old_id;
+            }
+        }
+    }
 
     // build special tokens cache
     {
@@ -6101,11 +6119,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
-            char buf[7];
-            int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
-            GGML_ASSERT(0 <= result && result < 7);
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
             return vocab.token_to_id.at(buf);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
@@ -7412,37 +7429,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     llama_sample_temp(ctx, candidates_p, temp);
 }
 
-void
-
-
-
-
-
-
-
-
-        if (token_iter == last_tokens + last_tokens_size) {
-            continue;
-        }
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty;
-        } else {
-            candidates->data[i].logit /= penalty;
-        }
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
-    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+void llama_sample_repetition_penalties(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+               const llama_token * last_tokens,
+                          size_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present) {
+    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
         return;
     }
 
@@ -7450,19 +7445,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
 
     // Create a frequency map to count occurrences of each token in last_tokens
     std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i <
-        token_count[
+    for (size_t i = 0; i < penalty_last_n; ++i) {
+        token_count[last_tokens[i]]++;
     }
 
     // Apply frequency and presence penalties to the candidates
    for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = token_count.find(candidates->data[i].id);
+        const auto token_iter = token_count.find(candidates->data[i].id);
         if (token_iter == token_count.end()) {
             continue;
         }
 
-        int count = token_iter->second;
-
+        const int count = token_iter->second;
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty_repeat;
+        } else {
+            candidates->data[i].logit /= penalty_repeat;
+        }
+
+        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
     }
 
     candidates->sorted = false;
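The merged sampler applies the CTRL-style repetition penalty and the OpenAI-style frequency/presence penalties in a single pass. A plain-Ruby sketch of the per-candidate rule above, for illustration only (not the gem's implementation; the helper name is made up):

```ruby
# Mirrors the C logic: `count` is how often the token appears in the penalized
# window. Dividing a negative logit by a penalty > 1 would make the token
# *more* likely, hence the multiply branch for logits <= 0.
def penalized_logit(logit, count, penalty_repeat:, penalty_freq:, penalty_present:)
  return logit if count.zero? # token not in the window: left untouched

  logit = logit <= 0 ? logit * penalty_repeat : logit / penalty_repeat
  logit - count * penalty_freq - penalty_present
end

penalized_logit(2.0, 3, penalty_repeat: 1.1, penalty_freq: 0.2, penalty_present: 0.5)
# => 2.0 / 1.1 - 3 * 0.2 - 0.5 ≈ 0.718
penalized_logit(-1.0, 1, penalty_repeat: 1.1, penalty_freq: 0.0, penalty_present: 0.0)
# => -1.1 (multiplied, so the repeated token stays less likely)
```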
@@ -7484,14 +7488,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    const llama_token eos = llama_token_eos(ctx);
+    const llama_token eos = llama_token_eos(&ctx->model);
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     std::vector<llama_grammar_candidate> candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece =
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7694,7 +7698,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();
 
-    if (token == llama_token_eos(ctx)) {
+    if (token == llama_token_eos(&ctx->model)) {
         for (const auto & stack : grammar->stacks) {
             if (stack.empty()) {
                 return;
@@ -7703,7 +7707,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece =
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
@@ -8903,7 +8907,7 @@ struct llama_context * llama_new_context_with_model(
             // build worst-case graph
             int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
             int n_past = cparams.n_ctx - n_tokens;
-            llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
 
 #ifdef GGML_USE_METAL
@@ -9664,43 +9668,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_get_text(const struct
-    return
+const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].text.c_str();
 }
 
-float llama_token_get_score(const struct
-    return
+float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].score;
 }
 
-llama_token_type llama_token_get_type(const struct
-    return
+llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].type;
 }
 
-llama_token llama_token_bos(const struct
-    return
+llama_token llama_token_bos(const struct llama_model * model) {
+    return model->vocab.special_bos_id;
 }
 
-llama_token llama_token_eos(const struct
-    return
+llama_token llama_token_eos(const struct llama_model * model) {
+    return model->vocab.special_eos_id;
 }
 
-llama_token llama_token_nl(const struct
-    return
+llama_token llama_token_nl(const struct llama_model * model) {
+    return model->vocab.linefeed_id;
 }
-
-
+
+llama_token llama_token_prefix(const struct llama_model * model) {
+    return model->vocab.special_prefix_id;
 }
 
-llama_token llama_token_middle(const struct
-    return
+llama_token llama_token_middle(const struct llama_model * model) {
+    return model->vocab.special_middle_id;
 }
 
-llama_token llama_token_suffix(const struct
-    return
+llama_token llama_token_suffix(const struct llama_model * model) {
+    return model->vocab.special_suffix_id;
 }
 
-llama_token llama_token_eot(const struct
-    return
+llama_token llama_token_eot(const struct llama_model * model) {
+    return model->vocab.special_eot_id;
 }
 
 int llama_tokenize(
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -494,21 +494,22 @@ extern "C" {
     // Vocab
     //
 
-    LLAMA_API const char * llama_token_get_text(const struct
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
 
-    LLAMA_API float llama_token_get_score(const struct
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct
-    LLAMA_API llama_token llama_token_eos(const struct
-    LLAMA_API llama_token llama_token_nl (const struct
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct
-    LLAMA_API llama_token llama_token_middle(const struct
-    LLAMA_API llama_token llama_token_suffix(const struct
-    LLAMA_API llama_token llama_token_eot (const struct
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
 
     //
     // Tokenization
@@ -560,21 +561,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
               const llama_token * last_tokens,
-                          size_t
-                           float
-                           float
+                          size_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.8.0'
+  VERSION = '0.9.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1429'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp
 
     # apply penalties
     last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-    context.
-
-
+    context.sample_repetition_penalties(
+      candidates, last_n_tokens[-last_n_repeat..],
+      penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
     )
 
     # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp
 
       embd.each { |token| output << context.model.token_to_piece(token) }
 
-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end
 
     output.join.scrub('?').strip.delete_prefix(prompt).strip
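For gem callers, the separate penalty calls from 0.8.x are replaced by a single `sample_repetition_penalties`, and the end-of-sequence id is now read through the model. A minimal sketch of the 0.9.0 call pattern, assuming `context`, `candidates`, `last_n_tokens`, and `embd` are set up as in the generate helper above (the helper name and penalty values are illustrative, not part of the gem):

```ruby
def apply_penalties_and_finished?(context, candidates, last_n_tokens, embd,
                                  repeat_last_n: 64, n_ctx: 2048)
  last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min

  # One call now covers the repetition, frequency, and presence penalties.
  context.sample_repetition_penalties(
    candidates, last_n_tokens[-last_n_repeat..],
    penalty_repeat: 1.1, penalty_freq: 0.0, penalty_present: 0.0
  )

  # The EOS token id moved from Context to Model in this release.
  !embd.empty? && embd[-1] == context.model.token_eos
end
```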
data/sig/llama_cpp.rbs
CHANGED
@@ -82,6 +82,16 @@ module LLaMACpp
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end
 
   class Timings
@@ -143,16 +153,6 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -170,8 +170,7 @@ module LLaMACpp
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
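Per the updated signatures, per-token metadata and special-token ids are now read from `Model` rather than `Context`. A small illustrative sketch of the relocated accessors (the `describe_vocab` helper is made up; loading a model follows the gem README):

```ruby
# Given an already-loaded LLaMACpp::Model, query the accessors that moved here.
def describe_vocab(model)
  bos = model.token_bos
  eos = model.token_eos
  nl  = model.token_nl

  puts "BOS=#{bos} #{model.text(bos).inspect}"
  puts "EOS=#{eos} score=#{model.score(eos)} type=#{model.type(eos)}"
  puts "NL=#{nl}"
end
```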
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-
+date: 2023-10-28 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: