llama_cpp 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  // memory sizes
  //

- static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 256ull * MB },
- { MODEL_7B, 512ull * MB },
- { MODEL_13B, 512ull * MB },
- { MODEL_30B, 512ull * MB },
- { MODEL_65B, 1024ull * MB },
+ /* empirical scaling, still a guess */
+ { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
+ { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
+ { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
+ { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
+ { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
  };
  return k_sizes;
  }
@@ -140,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()

  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
- static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
  {
  static std::map<e_model, size_t> k_sizes = {
- { MODEL_3B, 512ull * MB },
- { MODEL_7B, 768ull * MB },
- { MODEL_13B, 1024ull * MB },
- { MODEL_30B, 1280ull * MB },
- { MODEL_65B, 1536ull * MB },
+ { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
+ { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
+ { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+ { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+ { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
  };
  return k_sizes;
  }
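
Note: the scratch and eval buffers now scale with the requested context length instead of being fixed per model size. Plugging the diff's own constants in: a 7B model at the old default n_ctx = 512 gets (512/16 + 256) MB = 288 MB of scratch and (512/256 + 768) MB = 770 MB of eval buffer, while at n_ctx = 2048 those become 384 MB and 776 MB; previously the same buffers were fixed at 512 MB and 768 MB regardless of context size.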
@@ -189,6 +190,10 @@ struct llama_hparams {
  uint32_t n_head = 32;
  uint32_t n_layer = 32;
  uint32_t n_rot = 64;
+
+ float rope_freq_base = 10000.0f;
+ float rope_freq_scale = 1.0f;
+
  enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

  bool operator!=(const llama_hparams & other) const {
@@ -303,7 +308,7 @@ struct llama_model {
  };

  struct llama_context {
- llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+ llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
  #ifdef GGML_USE_METAL
  ~llama_context() {
  if (ctx_metal) {
@@ -324,7 +329,6 @@ struct llama_context {
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

  const llama_model & model;
- const llama_vocab & vocab;

  bool model_owner = false;

@@ -551,7 +555,9 @@ struct llama_file_loader {
  }

  // skip to the next multiple of 32 bytes
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+ if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+ }

  tensor.file_off = file.tell();
  tensor.name = name;
@@ -648,7 +654,7 @@ struct llama_model_loader {
  *ctx_size_p = *mmapped_size_p = 0;
  for (const llama_load_tensor & lt : tensors_map.tensors) {
  *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
- *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+ *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
  }
  }

@@ -843,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
  /*.n_batch =*/ 512,
  /*.gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
- /*.tensor_split =*/ {0},
+ /*.tensor_split =*/ nullptr,
+ /*.rope_freq_base =*/ 10000.0f,
+ /*.rope_freq_scale =*/ 1.0f,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.low_vram =*/ false,
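
Note: llama_context_default_params() now seeds the two new RoPE fields with the values above and leaves tensor_split as a null pointer. A minimal sketch of how a C++ caller might request a longer context with linear RoPE scaling against this API; the model path, the 4096-token context and the 0.5 scale factor are illustrative values, not taken from this diff:

// sketch only: extended context via the new RoPE fields (values are illustrative)
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(false); // numa = false

    llama_context_params params = llama_context_default_params();
    params.n_ctx           = 4096;     // ask for a 4096-token context
    params.rope_freq_base  = 10000.0f; // keep the default base frequency
    params.rope_freq_scale = 0.5f;     // compress positions 2:1 for a 2048-trained model

    llama_model * model = llama_load_model_from_file("./model.bin", params);
    if (model == nullptr) return 1;

    llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == nullptr) { llama_free_model(model); return 1; }

    printf("n_ctx = %d\n", llama_n_ctx(ctx));

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}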
@@ -869,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  return result;
  }

+ int llama_max_devices() {
+ return LLAMA_MAX_DEVICES;
+ }
+
  bool llama_mmap_supported() {
  return llama_mmap::SUPPORTED;
  }
@@ -967,6 +979,8 @@ static void llama_model_load_internal(
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
+ float rope_freq_base,
+ float rope_freq_scale,
  bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
@@ -1001,22 +1015,27 @@ static void llama_model_load_internal(
  }

  hparams.n_ctx = n_ctx;
+
+ hparams.rope_freq_base = rope_freq_base;
+ hparams.rope_freq_scale = rope_freq_scale;
  }

  const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

  {
- fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
- fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
- fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
- fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
- fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
- fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
- fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
- fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+ fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+ fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+ fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+ fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+ fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+ fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+ fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+ fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
- fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
- fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
+ fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+ fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
  }

  if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1165,9 +1184,9 @@ static void llama_model_load_internal(
  const size_t mem_required =
  ctx_size +
  mmapped_size - vram_weights + // weights in VRAM not in memory
- MEM_REQ_SCRATCH0().at(model.type) +
+ MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
  MEM_REQ_SCRATCH1().at(model.type) +
- MEM_REQ_EVAL().at (model.type);
+ MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

  // this is the memory required by one llama_state
  const size_t mem_required_state =
@@ -1270,7 +1289,9 @@ static bool llama_model_load(
  int n_batch,
  int n_gpu_layers,
  int main_gpu,
- float * tensor_split,
+ const float * tensor_split,
+ float rope_freq_base,
+ float rope_freq_scale,
  bool low_vram,
  ggml_type memory_type,
  bool use_mmap,
@@ -1279,7 +1300,7 @@ static bool llama_model_load(
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
  return true;
  } catch (const std::exception & err) {
@@ -1331,6 +1352,9 @@ static bool llama_eval_internal(
  const int n_rot = hparams.n_embd/hparams.n_head;
  const int n_gpu_layers = model.n_gpu_layers;

+ const float freq_base = hparams.rope_freq_base;
+ const float freq_scale = hparams.rope_freq_scale;
+
  auto & mem_per_token = lctx.mem_per_token;
  auto & buf_compute = lctx.buf_compute;

@@ -1428,11 +1452,11 @@ static bool llama_eval_internal(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");
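
Note: the switch from ggml_rope_inplace to ggml_rope_custom_inplace is what feeds freq_base and freq_scale into the attention graph. As I read ggml's custom RoPE, the two values enter the per-position rotation angle roughly as sketched below; the helper name rope_theta is made up for illustration and is not code from this diff:

// rough sketch of the rotary angle in the custom RoPE variant (assumption,
// based on ggml's implementation, not taken from this diff)
#include <cmath>

float rope_theta(int pos, int i, int n_dims, float freq_base, float freq_scale) {
    const float theta_scale = powf(freq_base, -2.0f / n_dims); // per-dimension-pair decay
    return freq_scale * (float) pos * powf(theta_scale, (float) i);
}

With freq_scale = 1.0f and freq_base = 10000.0f this reduces to the original RoPE; freq_scale < 1.0f compresses positions (the linear-scaling approach to longer contexts), while raising freq_base is the NTK-style alternative, and both knobs are exposed through the same two parameters.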
@@ -2006,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
  }

  // Normalize the second derivatives
- float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
- for (float & value : second_derivatives) {
- value /= second_derivatives_sum;
+ {
+ const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+ if (second_derivatives_sum > 1e-6f) {
+ for (float & value : second_derivatives) {
+ value /= second_derivatives_sum;
+ }
+ } else {
+ for (float & value : second_derivatives) {
+ value = 1.0f / second_derivatives.size();
+ }
+ }
  }

  float cum_sum = 0.0f;
@@ -2185,9 +2218,8 @@ void llama_sample_classifier_free_guidance(
  struct llama_context * ctx,
  llama_token_data_array * candidates,
  struct llama_context * guidance_ctx,
- float scale,
- float smooth_factor) {
- int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+ float scale) {
+ int64_t t_start_sample_us = ggml_time_us();

  assert(ctx);
  auto n_vocab = llama_n_vocab(ctx);
@@ -2207,16 +2239,7 @@ void llama_sample_classifier_free_guidance(
  for (int i = 0; i < n_vocab; ++i) {
  float logit_guidance = logits_guidance[i];
  float logit_base = logits_base[i];
- logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
- }
-
- llama_log_softmax(logits_guidance, n_vocab);
-
- for (int i = 0; i < n_vocab; ++i) {
- float logit_base = logits_base[i];
- float logit_guidance = logits_guidance[i];
-
- candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+ candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
  }

  if (ctx) {
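
Note: llama_sample_classifier_free_guidance() therefore drops smooth_factor, and the guided logit is now simply scale * (logit_base - logit_guidance) + logit_guidance. A sketch of a call against the new signature; the helper name and the greedy pick at the end are mine, and it assumes ctx and guidance_ctx were created from the same model and have each been evaluated on their respective prompts:

// sketch only: applying classifier-free guidance with the 0.3.4 signature
#include <vector>
#include "llama.h"

llama_token sample_with_cfg(llama_context * ctx, llama_context * guidance_ctx, float scale) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits    = llama_get_logits(ctx);

    // build the candidate array from the main context's logits
    std::vector<llama_token_data> candidates(n_vocab);
    for (int i = 0; i < n_vocab; ++i) {
        candidates[i] = llama_token_data{ i, logits[i], 0.0f };
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // 0.3.4: only `scale` is passed; the old smooth_factor argument is gone
    llama_sample_classifier_free_guidance(ctx, &candidates_p, guidance_ctx, scale);

    return llama_sample_token_greedy(ctx, &candidates_p);
}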
@@ -2675,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

  if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+ memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+ params.progress_callback_user_data)) {
  delete model;
  fprintf(stderr, "%s: failed to load model\n", __func__);
  return nullptr;
@@ -2697,7 +2721,7 @@ struct llama_context * llama_new_context_with_model(
  return nullptr;
  }

- llama_context * ctx = new llama_context(*model, model->vocab);
+ llama_context * ctx = new llama_context(*model);

  if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
@@ -2751,9 +2775,9 @@ struct llama_context * llama_new_context_with_model(
  ctx->embedding.resize(hparams.n_embd);
  }

- ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
  ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
  }

@@ -3535,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
  return 0;
  }

- int llama_tokenize(
- struct llama_context * ctx,
+ int llama_tokenize_with_model(
+ const struct llama_model * model,
  const char * text,
  llama_token * tokens,
  int n_max_tokens,
  bool add_bos) {
- auto res = llama_tokenize(ctx->vocab, text, add_bos);
+ auto res = llama_tokenize(model->vocab, text, add_bos);

  if (n_max_tokens < (int) res.size()) {
  fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3555,8 +3579,29 @@ int llama_tokenize(
  return res.size();
  }

+ int llama_tokenize(
+ struct llama_context * ctx,
+ const char * text,
+ llama_token * tokens,
+ int n_max_tokens,
+ bool add_bos) {
+ return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+ }
+
+ int llama_n_vocab_from_model(const struct llama_model * model) {
+ return model->vocab.id_to_token.size();
+ }
+
+ int llama_n_ctx_from_model(const struct llama_model * model) {
+ return model->hparams.n_ctx;
+ }
+
+ int llama_n_embd_from_model(const struct llama_model * model) {
+ return model->hparams.n_embd;
+ }
+
  int llama_n_vocab(const struct llama_context * ctx) {
- return ctx->vocab.id_to_token.size();
+ return ctx->model.vocab.id_to_token.size();
  }

  int llama_n_ctx(const struct llama_context * ctx) {
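
Note: the new *_with_model variants make the vocabulary usable from a bare llama_model, without creating a context first. A sketch of that pattern; the prompt text, buffer size and helper name are made up, and llama_token_to_str_with_model is added a little further down in this diff:

// sketch only: tokenize and detokenize against a model, no llama_context needed
#include <cstdio>
#include <vector>
#include "llama.h"

void dump_tokens(const llama_model * model, const char * text) {
    std::vector<llama_token> tokens(64);
    const int n = llama_tokenize_with_model(model, text, tokens.data(), (int) tokens.size(), /*add_bos=*/ true);
    if (n < 0) {
        fprintf(stderr, "token buffer too small\n");
        return;
    }
    for (int i = 0; i < n; ++i) {
        printf("%d -> '%s'\n", tokens[i], llama_token_to_str_with_model(model, tokens[i]));
    }
}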
@@ -3567,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

- int llama_get_vocab(
- const struct llama_context * ctx,
+ int llama_get_vocab_from_model(
+ const struct llama_model * model,
  const char * * strings,
  float * scores,
  int capacity) {
- int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+ int n = std::min(capacity, (int) model->vocab.id_to_token.size());
  for (int i = 0; i<n; ++i) {
- strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
- scores[i] = ctx->vocab.id_to_token[i].score;
+ strings[i] = model->vocab.id_to_token[i].tok.c_str();
+ scores[i] = model->vocab.id_to_token[i].score;
  }
  return n;
  }

+ int llama_get_vocab(
+ const struct llama_context * ctx,
+ const char * * strings,
+ float * scores,
+ int capacity) {
+ return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+ }
+
  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }
@@ -3588,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }

- const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
- if (token >= llama_n_vocab(ctx)) {
+ const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+ if (token >= llama_n_vocab_from_model(model)) {
  return nullptr;
  }

- return ctx->vocab.id_to_token[token].tok.c_str();
+ return model->vocab.id_to_token[token].tok.c_str();
+ }
+
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+ return llama_token_to_str_with_model(&ctx->model, token);
  }

  llama_token llama_token_bos() {
@@ -88,7 +88,13 @@ extern "C" {
  int32_t n_batch; // prompt processing batch size
  int32_t n_gpu_layers; // number of layers to store in VRAM
  int32_t main_gpu; // the GPU that is used for scratch and small tensors
- float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+ const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ float rope_freq_base; // RoPE base frequency
+ float rope_freq_scale; // RoPE frequency scaling factor
+
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
  // context pointer passed to the progress callback
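
Note: turning tensor_split into a const float * is a breaking change for C callers: code that previously wrote into params.tensor_split[i] must now supply its own array of llama_max_devices() floats and keep it alive while the model is loaded. A sketch, with a made-up 60/40 split across the first two devices and an illustrative n_gpu_layers value:

// sketch only: filling the new pointer-based tensor_split (proportions are made up)
#include <vector>
#include "llama.h"

llama_context_params make_params() {
    llama_context_params params = llama_context_default_params(); // tensor_split starts as nullptr

    // static so the pointer stays valid for the llama_load_model_from_file() call
    static std::vector<float> split(llama_max_devices(), 0.0f);
    if (split.size() >= 2) {
        split[0] = 0.6f; // proportion for device 0
        split[1] = 0.4f; // proportion for device 1
    }

    params.tensor_split = split.data();
    params.n_gpu_layers = 35; // illustrative
    return params;
}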
@@ -148,6 +154,8 @@ extern "C" {
  int32_t n_eval;
  };

+ LLAMA_API int llama_max_devices();
+
  LLAMA_API struct llama_context_params llama_context_default_params();
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -270,10 +278,21 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

+ LLAMA_API int llama_tokenize_with_model(
+ const struct llama_model * model,
+ const char * text,
+ llama_token * tokens,
+ int n_max_tokens,
+ bool add_bos);
+
  LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
  LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+ LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+ LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+ LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
  // Get the vocabulary as output parameters.
  // Returns number of results.
  LLAMA_API int llama_get_vocab(
@@ -282,6 +301,12 @@ extern "C" {
  float * scores,
  int capacity);

+ LLAMA_API int llama_get_vocab_from_model(
+ const struct llama_model * model,
+ const char * * strings,
+ float * scores,
+ int capacity);
+
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
  // Can be mutated in order to change the probabilities of the next token
@@ -294,7 +319,13 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(
+ const struct llama_context * ctx,
+ llama_token token);
+
+ LLAMA_API const char * llama_token_to_str_with_model(
+ const struct llama_model * model,
+ llama_token token);

  // Special tokens
  LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
@@ -313,13 +344,11 @@ extern "C" {
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
  /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
  /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
- /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
  LLAMA_API void llama_sample_classifier_free_guidance(
  struct llama_context * ctx,
  llama_token_data_array * candidates,
  struct llama_context * guidance_ctx,
- float scale,
- float smooth_factor);
+ float scale);

  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.3'
+ VERSION = '0.3.4'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-32c5411'
+ LLAMA_CPP_VERSION = 'master-d924522'
  end
data/lib/llama_cpp.rb CHANGED
@@ -109,3 +109,4 @@ module LLaMACpp
  end

  LLaMACpp.backend_init
+ at_exit { LLaMACpp.backend_free }
data/sig/llama_cpp.rbs CHANGED
@@ -39,6 +39,7 @@ module LLaMACpp
  def self?.token_nl: () -> Integer
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool
+ def self?.max_devices: () -> Integer

  class TokenData
  public
@@ -69,6 +70,12 @@ module LLaMACpp
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+ def n_vocab: () -> Integer
+ def n_ctx: () -> Integer
+ def n_embd: () -> Integer
+ def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+ def token_to_str: (Integer) -> String
+ def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  end

  class Timings
@@ -109,7 +116,7 @@ module LLaMACpp
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
  def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
+ def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -140,6 +147,10 @@ module LLaMACpp
  def main_gpu: () -> Integer
  def main_gpu=: (Integer) -> Integer
  def tensor_split: () -> Array[Float]
+ def rope_freq_base=: (Float) -> Float
+ def rope_freq_base: () -> Float
+ def rope_freq_scale=: (Float) -> Float
+ def rope_freq_scale: () -> Float
  def low_vram: () -> bool
  def low_vram=: (bool) -> bool
  def seed: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.3
+ version: 0.3.4
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-07-15 00:00:00.000000000 Z
+ date: 2023-07-22 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: