llama_cpp 0.8.0 → 0.9.0

@@ -401,15 +401,16 @@ extern "C" {
      GGML_OP_ALIBI,
      GGML_OP_CLAMP,
      GGML_OP_CONV_1D,
-     GGML_OP_CONV_2D,
+     GGML_OP_CONV_1D_STAGE_0, // internal
+     GGML_OP_CONV_1D_STAGE_1, // internal
      GGML_OP_CONV_TRANSPOSE_1D,
+     GGML_OP_CONV_2D,
+     GGML_OP_CONV_2D_STAGE_0, // internal
+     GGML_OP_CONV_2D_STAGE_1, // internal
      GGML_OP_CONV_TRANSPOSE_2D,
      GGML_OP_POOL_1D,
      GGML_OP_POOL_2D,

-     GGML_OP_CONV_1D_STAGE_0, // internal
-     GGML_OP_CONV_1D_STAGE_1, // internal
-
      GGML_OP_UPSCALE, // nearest interpolate

      GGML_OP_FLASH_ATTN,
@@ -1020,9 +1021,9 @@ extern "C" {
              struct ggml_tensor * b,
              float eps);

-     // A: n columns, m rows
-     // B: n columns, p rows  (i.e. we transpose it internally)
-     // result is m columns, p rows
+     // A: k columns, n rows => [ne03, ne02, n, k]
+     // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+     // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
      GGML_API struct ggml_tensor * ggml_mul_mat(
              struct ggml_context * ctx,
              struct ggml_tensor * a,
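
One way to read the updated shape comment (a restatement of the existing semantics, not new behavior): writing the last two ggml dimensions as rows by columns, a holds an n-by-k matrix, b holds an m-by-k matrix, and ggml_mul_mat multiplies b by the transpose of a; the ne02/ne03 entries are the higher batch dimensions, broadcast by the factors x and y.

    A \in \mathbb{R}^{n \times k}, \quad B \in \mathbb{R}^{m \times k} \;\Longrightarrow\; \mathrm{ggml\_mul\_mat}(A, B) = B A^{\top} \in \mathbb{R}^{m \times n}
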
@@ -970,14 +970,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
      (void) tensor;
  }

- static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
      std::vector<char> result(8, 0);
      const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
      if (n_tokens < 0) {
          result.resize(-n_tokens);
          int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
          GGML_ASSERT(check == -n_tokens);
-     } else {
+     }
+     else {
          result.resize(n_tokens);
      }

@@ -1013,8 +1014,8 @@ enum e_model {
  };

  static const size_t kB = 1024;
- static const size_t MB = kB*kB;
- static const size_t GB = kB*kB*kB;
+ static const size_t MB = 1024*kB;
+ static const size_t GB = 1024*MB;

  struct llama_hparams {
      bool vocab_only;
@@ -1037,21 +1038,21 @@ struct llama_hparams {
      float f_max_alibi_bias;

      bool operator!=(const llama_hparams & other) const {
-         if (this->vocab_only != other.vocab_only) return true;
-         if (this->n_vocab != other.n_vocab) return true;
+         if (this->vocab_only    != other.vocab_only)    return true;
+         if (this->n_vocab       != other.n_vocab)       return true;
          if (this->n_ctx_train   != other.n_ctx_train)   return true;
-         if (this->n_embd != other.n_embd) return true;
-         if (this->n_head != other.n_head) return true;
-         if (this->n_head_kv != other.n_head_kv) return true;
-         if (this->n_layer != other.n_layer) return true;
-         if (this->n_rot != other.n_rot) return true;
-         if (this->n_ff != other.n_ff) return true;
+         if (this->n_embd        != other.n_embd)        return true;
+         if (this->n_head        != other.n_head)        return true;
+         if (this->n_head_kv     != other.n_head_kv)     return true;
+         if (this->n_layer       != other.n_layer)       return true;
+         if (this->n_rot         != other.n_rot)         return true;
+         if (this->n_ff          != other.n_ff)          return true;

          const float EPSILON = 1e-9;

-         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-         if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+         if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+         if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+         if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
          if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

          return false;
@@ -1190,17 +1191,17 @@ struct llama_vocab {
      id special_sep_id = -1;
      id special_pad_id = -1;

-     id linefeed_id = 13;
+     id linefeed_id       = 13;
      id special_prefix_id = 32007;
      id special_middle_id = 32009;
      id special_suffix_id = 32008;
-     id special_eot_id = 32010;
+     id special_eot_id    = 32010;

      int find_bpe_rank(std::string token_left, std::string token_right) const {
-         replace_all(token_left,  " ",  "\u0120");
-         replace_all(token_left,  "\n", "\u010A");
-         replace_all(token_right, " ",  "\u0120");
-         replace_all(token_right, "\n", "\u010A");
+         GGML_ASSERT(token_left.find(" ") == std::string::npos);
+         GGML_ASSERT(token_left.find("\n") == std::string::npos);
+         GGML_ASSERT(token_right.find(" ") == std::string::npos);
+         GGML_ASSERT(token_right.find("\n") == std::string::npos);

          auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
          if (it == bpe_ranks.end()) {
@@ -1354,10 +1355,7 @@ static bool llama_kv_cache_init(
      cache.cells.clear();
      cache.cells.resize(n_ctx);

-     // TODO: this should be:
-     //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
-     //   change it and test that it works
-     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
      memset(cache.buf.data, 0, cache.buf.size);

      struct ggml_init_params params;
@@ -2236,15 +2234,35 @@ static void llm_load_vocab(
          if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
              vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
          } else {
-             vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
+             const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+             GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+             vocab.linefeed_id = ids[0];
          }

          // special tokens
-         GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
-         GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
-         GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
-         GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
-         GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+         {
+             const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+                 { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+                 { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+                 { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+                 { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+                 { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+             };
+             for (const auto & it : special_token_types) {
+                 const std::string & key = kv(std::get<0>(it));
+                 int32_t & id = std::get<1>(it), old_id = id;
+
+                 GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
+                 // Must be >= -1 and < vocab size. Since the key is unsigned, -1
+                 // can only come from the default value, so there's no point in
+                 // validating that.
+                 if (size_t(id + 1) > vocab.id_to_token.size()) {
+                     LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
+                         __func__, key.c_str(), id, old_id);
+                     id = old_id;
+                 }
+             }
+         }

          // build special tokens cache
          {
@@ -6101,11 +6119,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
  }

  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+     static const char * hex = "0123456789ABCDEF";
      switch (llama_vocab_get_type(vocab)) {
          case LLAMA_VOCAB_TYPE_SPM: {
-             char buf[7];
-             int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
-             GGML_ASSERT(0 <= result && result < 7);
+             const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
              return vocab.token_to_id.at(buf);
          }
          case LLAMA_VOCAB_TYPE_BPE: {
@@ -7412,37 +7429,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
      llama_sample_temp(ctx, candidates_p, temp);
  }

- void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
-     if (last_tokens_size == 0 || penalty == 1.0f) {
-         return;
-     }
-
-     const int64_t t_start_sample_us = ggml_time_us();
-
-     for (size_t i = 0; i < candidates->size; ++i) {
-         const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
-         if (token_iter == last_tokens + last_tokens_size) {
-             continue;
-         }
-
-         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-         if (candidates->data[i].logit <= 0) {
-             candidates->data[i].logit *= penalty;
-         } else {
-             candidates->data[i].logit /= penalty;
-         }
-     }
-
-     candidates->sorted = false;
-
-     if (ctx) {
-         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-     }
- }
-
- void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
-     if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+ void llama_sample_repetition_penalties(
+         struct llama_context * ctx,
+         llama_token_data_array * candidates,
+         const llama_token * last_tokens,
+         size_t penalty_last_n,
+         float penalty_repeat,
+         float penalty_freq,
+         float penalty_present) {
+     if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
          return;
      }

@@ -7450,19 +7445,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l

      // Create a frequency map to count occurrences of each token in last_tokens
      std::unordered_map<llama_token, int> token_count;
-     for (size_t i = 0; i < last_tokens_size; ++i) {
-         token_count[last_tokens_p[i]]++;
+     for (size_t i = 0; i < penalty_last_n; ++i) {
+         token_count[last_tokens[i]]++;
      }

      // Apply frequency and presence penalties to the candidates
      for (size_t i = 0; i < candidates->size; ++i) {
-         auto token_iter = token_count.find(candidates->data[i].id);
+         const auto token_iter = token_count.find(candidates->data[i].id);
          if (token_iter == token_count.end()) {
              continue;
          }

-         int count = token_iter->second;
-         candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+         const int count = token_iter->second;
+
+         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+         if (candidates->data[i].logit <= 0) {
+             candidates->data[i].logit *= penalty_repeat;
+         } else {
+             candidates->data[i].logit /= penalty_repeat;
+         }
+
+         candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
      }

      candidates->sorted = false;
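
For reference, the merged routine applies the following update to each candidate logit whose token occurs c_i > 0 times among the last penalty_last_n tokens (candidates that do not occur are skipped); this simply restates the code above rather than adding behavior:

    \ell_i \leftarrow \begin{cases} \ell_i \cdot \texttt{penalty\_repeat}, & \ell_i \le 0 \\ \ell_i / \texttt{penalty\_repeat}, & \ell_i > 0 \end{cases}
    \qquad \text{then} \qquad
    \ell_i \leftarrow \ell_i - c_i \cdot \texttt{penalty\_freq} - \mathbf{1}[c_i > 0] \cdot \texttt{penalty\_present}
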
@@ -7484,14 +7488,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
          }
      }

-     const llama_token eos = llama_token_eos(ctx);
+     const llama_token eos = llama_token_eos(&ctx->model);

      std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
      std::vector<llama_grammar_candidate> candidates_grammar;

      for (size_t i = 0; i < candidates->size; ++i) {
          const llama_token id = candidates->data[i].id;
-         const std::string piece = llama_token_to_str(ctx, id);
+         const std::string piece = llama_token_to_piece(ctx, id);
          if (id == eos) {
              if (!allow_eos) {
                  candidates->data[i].logit = -INFINITY;
@@ -7694,7 +7698,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
  void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
      const int64_t t_start_sample_us = ggml_time_us();

-     if (token == llama_token_eos(ctx)) {
+     if (token == llama_token_eos(&ctx->model)) {
          for (const auto & stack : grammar->stacks) {
              if (stack.empty()) {
                  return;
@@ -7703,7 +7707,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
          GGML_ASSERT(false);
      }

-     const std::string piece = llama_token_to_str(ctx, token);
+     const std::string piece = llama_token_to_piece(ctx, token);

      // Note terminating 0 in decoded string
      const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
@@ -8903,7 +8907,7 @@ struct llama_context * llama_new_context_with_model(
          // build worst-case graph
          int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
          int n_past = cparams.n_ctx - n_tokens;
-         llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+         llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
          ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

  #ifdef GGML_USE_METAL
@@ -9664,43 +9668,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
      return ctx->embedding.data();
  }

- const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
-     return ctx->model.vocab.id_to_token[token].text.c_str();
+ const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+     return model->vocab.id_to_token[token].text.c_str();
  }

- float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
-     return ctx->model.vocab.id_to_token[token].score;
+ float llama_token_get_score(const struct llama_model * model, llama_token token) {
+     return model->vocab.id_to_token[token].score;
  }

- llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
-     return ctx->model.vocab.id_to_token[token].type;
+ llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+     return model->vocab.id_to_token[token].type;
  }

- llama_token llama_token_bos(const struct llama_context * ctx) {
-     return ctx->model.vocab.special_bos_id;
+ llama_token llama_token_bos(const struct llama_model * model) {
+     return model->vocab.special_bos_id;
  }

- llama_token llama_token_eos(const struct llama_context * ctx) {
-     return ctx->model.vocab.special_eos_id;
+ llama_token llama_token_eos(const struct llama_model * model) {
+     return model->vocab.special_eos_id;
  }

- llama_token llama_token_nl(const struct llama_context * ctx) {
-     return ctx->model.vocab.linefeed_id;
+ llama_token llama_token_nl(const struct llama_model * model) {
+     return model->vocab.linefeed_id;
  }
- llama_token llama_token_prefix(const struct llama_context * ctx) {
-     return ctx->model.vocab.special_prefix_id;
+
+ llama_token llama_token_prefix(const struct llama_model * model) {
+     return model->vocab.special_prefix_id;
  }

- llama_token llama_token_middle(const struct llama_context * ctx) {
-     return ctx->model.vocab.special_middle_id;
+ llama_token llama_token_middle(const struct llama_model * model) {
+     return model->vocab.special_middle_id;
  }

- llama_token llama_token_suffix(const struct llama_context * ctx) {
-     return ctx->model.vocab.special_suffix_id;
+ llama_token llama_token_suffix(const struct llama_model * model) {
+     return model->vocab.special_suffix_id;
  }

- llama_token llama_token_eot(const struct llama_context * ctx) {
-     return ctx->model.vocab.special_eot_id;
+ llama_token llama_token_eot(const struct llama_model * model) {
+     return model->vocab.special_eot_id;
  }

  int llama_tokenize(
@@ -494,21 +494,22 @@ extern "C" {
      // Vocab
      //

-     LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+     LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);

-     LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+     LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);

-     LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+     LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

      // Special tokens
-     LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
-     LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
-     LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
      // codellama infill tokens
-     LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
-     LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
-     LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
-     LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle
+     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+     LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+     LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle

      //
      // Tokenization
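
Downstream, this relocation is what moves the token accessors from Context to Model in the Ruby bindings (see the RBS changes further down). A minimal migration sketch; the Model constructor keywords and the params classes here are assumptions for illustration, while context.model and the token_* readers are taken from this diff:

  require 'llama_cpp'

  # model path and the params classes below are illustrative assumptions
  model   = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
  context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

  # llama_cpp 0.8.0: special tokens were read from the context
  #   eos = context.token_eos
  # llama_cpp 0.9.0: they are read from the model, directly or via context.model
  eos = model.token_eos
  bos = context.model.token_bos
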
@@ -560,21 +561,15 @@ extern "C" {
      LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

      /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-     LLAMA_API void llama_sample_repetition_penalty(
-             struct llama_context * ctx,
-             llama_token_data_array * candidates,
-             const llama_token * last_tokens,
-             size_t last_tokens_size,
-             float penalty);
-
      /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-     LLAMA_API void llama_sample_frequency_and_presence_penalties(
+     LLAMA_API void llama_sample_repetition_penalties(
              struct llama_context * ctx,
              llama_token_data_array * candidates,
              const llama_token * last_tokens,
-             size_t last_tokens_size,
-             float alpha_frequency,
-             float alpha_presence);
+             size_t penalty_last_n,
+             float penalty_repeat,
+             float penalty_freq,
+             float penalty_present);

      /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
      /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.8.0'
+   VERSION = '0.9.0'

    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b1405'
+   LLAMA_CPP_VERSION = 'b1429'
  end
data/lib/llama_cpp.rb CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp

        # apply penalties
        last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-       context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-       context.sample_frequency_and_presence_penalties(
-         candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+       context.sample_repetition_penalties(
+         candidates, last_n_tokens[-last_n_repeat..],
+         penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
        )

        # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp

        embd.each { |token| output << context.model.token_to_piece(token) }

-       break if !embd.empty? && embd[-1] == context.token_eos
+       break if !embd.empty? && embd[-1] == context.model.token_eos
      end

      output.join.scrub('?').strip.delete_prefix(prompt).strip
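
The two sampler calls above collapse into one; a usage sketch against the new binding, reusing candidates, last_n_tokens and last_n_repeat from the generate helper shown in this hunk (the numeric values are placeholders, not recommended defaults):

  # 0.8.0:
  #   context.sample_repetition_penalty(candidates, recent, penalty: 1.1)
  #   context.sample_frequency_and_presence_penalties(candidates, recent, frequency: 0.0, presence: 0.0)
  # 0.9.0: one call carries all three penalties.
  recent = last_n_tokens[-last_n_repeat..]
  context.sample_repetition_penalties(
    candidates, recent,
    penalty_repeat: 1.1,    # was `penalty:`
    penalty_freq: 0.0,      # was `frequency:`
    penalty_present: 0.0    # was `presence:`
  )
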
data/sig/llama_cpp.rbs CHANGED
@@ -82,6 +82,16 @@ module LLaMACpp
      def desc: () -> String
      def size: () -> Integer
      def n_params: () -> Integer
+     def text: (Integer) -> String
+     def score: (Integer) -> Float
+     def type: (Integer) -> Integer
+     def token_bos: () -> Integer
+     def token_eos: () -> Integer
+     def token_nl: () -> Integer
+     def token_prefix: () -> Integer
+     def token_middle: () -> Integer
+     def token_suffix: () -> Integer
+     def token_eot: () -> Integer
    end

    class Timings
@@ -143,16 +153,6 @@ module LLaMACpp

      def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
      def embeddings: () -> Array[Float]
-     def text: (Integer) -> String
-     def score: (Integer) -> Float
-     def type: (Integer) -> Integer
-     def token_bos: () -> Integer
-     def token_eos: () -> Integer
-     def token_nl: () -> Integer
-     def token_prefix: () -> Integer
-     def token_middle: () -> Integer
-     def token_suffix: () -> Integer
-     def token_eot: () -> Integer
      def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
      def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
      def decode: (::LLaMACpp::Batch) -> void
@@ -170,8 +170,7 @@ module LLaMACpp
      def set_rng_seed: (Integer) -> void
      def load_session_file: (session_path: String) -> void
      def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+     def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
      def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
      def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
      def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
-   version: 0.8.0
+   version: 0.9.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-10-21 00:00:00.000000000 Z
+ date: 2023-10-28 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: