llama_cpp 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -401,15 +401,16 @@ extern "C" {
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
-        GGML_OP_CONV_2D,
+        GGML_OP_CONV_1D_STAGE_0, // internal
+        GGML_OP_CONV_1D_STAGE_1, // internal
         GGML_OP_CONV_TRANSPOSE_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_CONV_2D_STAGE_0, // internal
+        GGML_OP_CONV_2D_STAGE_1, // internal
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,

-        GGML_OP_CONV_1D_STAGE_0, // internal
-        GGML_OP_CONV_1D_STAGE_1, // internal
-
         GGML_OP_UPSCALE, // nearest interpolate

         GGML_OP_FLASH_ATTN,
@@ -1020,9 +1021,9 @@ extern "C" {
             struct ggml_tensor * b,
             float eps);

-    // A: n columns, m rows
-    // B: n columns, p rows  (i.e. we transpose it internally)
-    // result is m columns, p rows
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
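Reading note on the revised ggml_mul_mat comment above: in ggml, ne[0] is the number of columns, so both operands share the contracted dimension k in ne[0], and the result takes its column count from a's row count and its row count from b's row count. A minimal sketch of that shape convention using the public ggml API (the context size and tensor dimensions below are illustrative, not part of this diff):

    #include <cassert>
    #include "ggml.h"

    int main() {
        // small scratch context; the size is an arbitrary illustration
        struct ggml_init_params params = { 16u*1024*1024, nullptr, false };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t k = 64, n = 8, m = 4;
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n); // A: k columns, n rows
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, m); // B: k columns, m rows
        struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);                      // result: n columns, m rows

        assert(c->ne[0] == n && c->ne[1] == m);
        ggml_free(ctx);
        return 0;
    }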
@@ -970,14 +970,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }

-static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
-    } else {
+    }
+    else {
         result.resize(n_tokens);
     }

@@ -1013,8 +1014,8 @@ enum e_model {
 };

 static const size_t kB = 1024;
-static const size_t MB = kB*kB;
-static const size_t GB = kB*kB*kB;
+static const size_t MB = 1024*kB;
+static const size_t GB = 1024*MB;

 struct llama_hparams {
     bool vocab_only;
@@ -1037,21 +1038,21 @@ struct llama_hparams {
     float f_max_alibi_bias;

     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
-        if (this->n_vocab != other.n_vocab) return true;
+        if (this->vocab_only  != other.vocab_only)  return true;
+        if (this->n_vocab     != other.n_vocab)     return true;
         if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
-        if (this->n_head != other.n_head) return true;
-        if (this->n_head_kv != other.n_head_kv) return true;
-        if (this->n_layer != other.n_layer) return true;
-        if (this->n_rot != other.n_rot) return true;
-        if (this->n_ff != other.n_ff) return true;
+        if (this->n_embd      != other.n_embd)      return true;
+        if (this->n_head      != other.n_head)      return true;
+        if (this->n_head_kv   != other.n_head_kv)   return true;
+        if (this->n_layer     != other.n_layer)     return true;
+        if (this->n_rot       != other.n_rot)       return true;
+        if (this->n_ff        != other.n_ff)        return true;

         const float EPSILON = 1e-9;

-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

         return false;
@@ -1190,17 +1191,17 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;

-    id linefeed_id = 13;
+    id linefeed_id       = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
     id special_suffix_id = 32008;
-    id special_eot_id = 32010;
+    id special_eot_id    = 32010;

     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left, " ", "\u0120");
-        replace_all(token_left, "\n", "\u010A");
-        replace_all(token_right, " ", "\u0120");
-        replace_all(token_right, "\n", "\u010A");
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);

         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1354,10 +1355,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    // TODO: this should be:
-    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
-    //   change it and test that it works
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
@@ -2236,15 +2234,35 @@ static void llm_load_vocab(
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        vocab.linefeed_id = ids[0];
     }

     // special tokens
-    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
-    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
-    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
-    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
-    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+        };
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it), old_id = id;
+
+            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
+            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
+            // can only come from the default value, so there's no point in
+            // validating that.
+            if (size_t(id + 1) > vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
+                    __func__, key.c_str(), id, old_id);
+                id = old_id;
+            }
+        }
+    }

     // build special tokens cache
     {
@@ -6101,11 +6119,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
-            char buf[7];
-            int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
-            GGML_ASSERT(0 <= result && result < 7);
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
             return vocab.token_to_id.at(buf);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
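The rewritten SPM byte fallback above builds the `<0xNN>` piece by hand from a hex-digit table instead of going through snprintf; for example, the newline byte 0x0A still resolves to the vocab entry "<0x0A>". A small standalone check of the same mapping (the helper name here is made up for illustration):

    #include <cassert>
    #include <cstdint>
    #include <string>

    // same construction as the new llama_byte_to_token SPM branch: "<0x" + two uppercase hex digits + ">"
    static std::string byte_piece(uint8_t ch) {
        static const char * hex = "0123456789ABCDEF";
        const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
        return buf;
    }

    int main() {
        assert(byte_piece(0x0A) == "<0x0A>"); // the newline byte
        assert(byte_piece(0xFF) == "<0xFF>");
        return 0;
    }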
@@ -7412,37 +7429,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     llama_sample_temp(ctx, candidates_p, temp);
 }

-void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
-    if (last_tokens_size == 0 || penalty == 1.0f) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
-        if (token_iter == last_tokens + last_tokens_size) {
-            continue;
-        }
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty;
-        } else {
-            candidates->data[i].logit /= penalty;
-        }
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
-    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+void llama_sample_repetition_penalties(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        const llama_token * last_tokens,
+        size_t penalty_last_n,
+        float penalty_repeat,
+        float penalty_freq,
+        float penalty_present) {
+    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
         return;
     }

@@ -7450,19 +7445,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l

     // Create a frequency map to count occurrences of each token in last_tokens
     std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i < last_tokens_size; ++i) {
-        token_count[last_tokens_p[i]]++;
+    for (size_t i = 0; i < penalty_last_n; ++i) {
+        token_count[last_tokens[i]]++;
     }

     // Apply frequency and presence penalties to the candidates
     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = token_count.find(candidates->data[i].id);
+        const auto token_iter = token_count.find(candidates->data[i].id);
         if (token_iter == token_count.end()) {
             continue;
         }

-        int count = token_iter->second;
-        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+        const int count = token_iter->second;
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty_repeat;
+        } else {
+            candidates->data[i].logit /= penalty_repeat;
+        }
+
+        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
     }

     candidates->sorted = false;
@@ -7484,14 +7488,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }

-    const llama_token eos = llama_token_eos(ctx);
+    const llama_token eos = llama_token_eos(&ctx->model);

     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     std::vector<llama_grammar_candidate> candidates_grammar;

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_str(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7694,7 +7698,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();

-    if (token == llama_token_eos(ctx)) {
+    if (token == llama_token_eos(&ctx->model)) {
         for (const auto & stack : grammar->stacks) {
             if (stack.empty()) {
                 return;
@@ -7703,7 +7707,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string piece = llama_token_to_str(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token);

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
@@ -8903,7 +8907,7 @@ struct llama_context * llama_new_context_with_model(
         // build worst-case graph
         int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
         int n_past = cparams.n_ctx - n_tokens;
-        llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+        llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
         ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

 #ifdef GGML_USE_METAL
@@ -9664,43 +9668,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
-    return ctx->model.vocab.id_to_token[token].text.c_str();
+const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].text.c_str();
 }

-float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
-    return ctx->model.vocab.id_to_token[token].score;
+float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].score;
 }

-llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
-    return ctx->model.vocab.id_to_token[token].type;
+llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].type;
 }

-llama_token llama_token_bos(const struct llama_context * ctx) {
-    return ctx->model.vocab.special_bos_id;
+llama_token llama_token_bos(const struct llama_model * model) {
+    return model->vocab.special_bos_id;
 }

-llama_token llama_token_eos(const struct llama_context * ctx) {
-    return ctx->model.vocab.special_eos_id;
+llama_token llama_token_eos(const struct llama_model * model) {
+    return model->vocab.special_eos_id;
 }

-llama_token llama_token_nl(const struct llama_context * ctx) {
-    return ctx->model.vocab.linefeed_id;
+llama_token llama_token_nl(const struct llama_model * model) {
+    return model->vocab.linefeed_id;
 }
-llama_token llama_token_prefix(const struct llama_context * ctx) {
-    return ctx->model.vocab.special_prefix_id;
+
+llama_token llama_token_prefix(const struct llama_model * model) {
+    return model->vocab.special_prefix_id;
 }

-llama_token llama_token_middle(const struct llama_context * ctx) {
-    return ctx->model.vocab.special_middle_id;
+llama_token llama_token_middle(const struct llama_model * model) {
+    return model->vocab.special_middle_id;
 }

-llama_token llama_token_suffix(const struct llama_context * ctx) {
-    return ctx->model.vocab.special_suffix_id;
+llama_token llama_token_suffix(const struct llama_model * model) {
+    return model->vocab.special_suffix_id;
 }

-llama_token llama_token_eot(const struct llama_context * ctx) {
-    return ctx->model.vocab.special_eot_id;
+llama_token llama_token_eot(const struct llama_model * model) {
+    return model->vocab.special_eot_id;
 }

 int llama_tokenize(
@@ -494,21 +494,22 @@ extern "C" {
     // Vocab
     //

-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);

-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);

-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle

     //
     // Tokenization
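With this header change the vocab and special-token getters take a llama_model instead of a llama_context; code that only holds a context can go through llama_get_model(), which this diff itself uses in llama_token_to_piece. A hedged migration sketch (assumes an already-initialized context; only the calls shown come from this diff):

    #include <cstdio>
    #include "llama.h"

    // fetch special tokens through the model, as required by the 0.9.0-era API
    static void print_special_tokens(struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx);

        const llama_token bos = llama_token_bos(model); // was llama_token_bos(ctx)
        const llama_token eos = llama_token_eos(model); // was llama_token_eos(ctx)
        const llama_token nl  = llama_token_nl (model); // was llama_token_nl(ctx)

        printf("bos=%d eos=%d nl=%d bos text=%s\n",
               bos, eos, nl, llama_token_get_text(model, bos));
    }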
@@ -560,21 +561,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            const llama_token * last_tokens,
-            size_t last_tokens_size,
-            float penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
             const llama_token * last_tokens,
-            size_t last_tokens_size,
-            float alpha_frequency,
-            float alpha_presence);
+            size_t penalty_last_n,
+            float penalty_repeat,
+            float penalty_freq,
+            float penalty_present);

     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
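The two removed sampling entry points are folded into llama_sample_repetition_penalties, so a call site that used to chain the repetition penalty and the frequency/presence penalties now makes a single call. A hedged sketch of the migration (the wrapper function and penalty values are illustrative):

    #include <vector>
    #include "llama.h"

    // apply repetition, frequency and presence penalties with the merged 0.9.0-era call
    static void apply_penalties(struct llama_context * ctx,
                                llama_token_data_array * candidates,
                                const std::vector<llama_token> & last_tokens) {
        // before (0.8.0): two calls
        //   llama_sample_repetition_penalty(ctx, candidates, last_tokens.data(), last_tokens.size(), 1.1f);
        //   llama_sample_frequency_and_presence_penalties(ctx, candidates, last_tokens.data(), last_tokens.size(), 0.0f, 0.0f);
        // after (0.9.0): one call
        llama_sample_repetition_penalties(ctx, candidates,
                                          last_tokens.data(), last_tokens.size(),
                                          /*penalty_repeat =*/ 1.1f,
                                          /*penalty_freq   =*/ 0.0f,
                                          /*penalty_present=*/ 0.0f);
    }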
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.8.0'
+  VERSION = '0.9.0'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1405'
+  LLAMA_CPP_VERSION = 'b1429'
 end
data/lib/llama_cpp.rb CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp

       # apply penalties
       last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-      context.sample_frequency_and_presence_penalties(
-        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
       )

       # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp

       embd.each { |token| output << context.model.token_to_piece(token) }

-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end

     output.join.scrub('?').strip.delete_prefix(prompt).strip
data/sig/llama_cpp.rbs CHANGED
@@ -82,6 +82,16 @@ module LLaMACpp
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end

   class Timings
@@ -143,16 +153,6 @@ module LLaMACpp

     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -170,8 +170,7 @@ module LLaMACpp
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-21 00:00:00.000000000 Z
+date: 2023-10-28 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: