llama_cpp 0.3.3 → 0.3.4

@@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
-        { MODEL_7B,    512ull * MB },
-        { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,  1024ull * MB },
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
     };
     return k_sizes;
 }
@@ -140,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        { MODEL_7B,   768ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
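
Note on the two tables above: the scratch and eval buffer sizes, which used to be fixed per model size, now grow with the requested context length. A rough sanity check of the new 7B scratch formula (integer division, sizes in MB) — the Ruby below only mirrors the C expression and is purely illustrative:

    MB = 1024 * 1024

    # MEM_REQ_SCRATCH0 for MODEL_7B: n_ctx / 16 + 256 (in MB), mirroring the table above
    scratch0_7b = ->(n_ctx) { (n_ctx / 16 + 256) * MB }

    scratch0_7b.call(2048) / MB  # => 384 (the old table reserved a flat 512 MB)
    scratch0_7b.call(8192) / MB  # => 768 (grows with the context instead of staying at 512 MB)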
@@ -189,6 +190,10 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -303,7 +308,7 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
@@ -324,7 +329,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
     const llama_model & model;
-    const llama_vocab & vocab;
 
     bool model_owner = false;
 
@@ -551,7 +555,9 @@ struct llama_file_loader {
             }
 
             // skip to the next multiple of 32 bytes
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            }
 
             tensor.file_off = file.tell();
             tensor.name = name;
@@ -648,7 +654,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
 
@@ -843,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch                     =*/ 512,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ {0},
+        /*.tensor_split                =*/ nullptr,
+        /*.rope_freq_base              =*/ 10000.0f,
+        /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
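
The two new defaults, `rope_freq_base = 10000.0f` and `rope_freq_scale = 1.0f`, reproduce standard RoPE behaviour; the knobs exist so callers can stretch the usable context (see llama.cpp PR #2054). On the Ruby side they surface as `ContextParams#rope_freq_base=` and `#rope_freq_scale=` (see the RBS changes further down). A minimal sketch — the `n_ctx=` setter and the overall `ContextParams` flow are assumed from the gem's existing API:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    # Defaults match standard RoPE: rope_freq_base = 10000.0, rope_freq_scale = 1.0.
    # Halving the scale is the common linear-interpolation trick for running a
    # 2048-token model with a 4096-token window (output quality may degrade).
    params.n_ctx           = 4096
    params.rope_freq_scale = 0.5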
@@ -869,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -967,6 +979,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1001,22 +1015,27 @@ static void llama_model_load_internal(
         }
 
         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base  = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
-        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: format     = %s\n",   __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab    = %u\n",   __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx      = %u\n",   __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd     = %u\n",   __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult     = %u\n",   __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
-        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
+        fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
+        fprintf(stderr, "%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1165,9 +1184,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at    (model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1270,7 +1289,9 @@ static bool llama_model_load(
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1279,7 +1300,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1331,6 +1352,9 @@ static bool llama_eval_internal(
     const int n_rot        = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
@@ -1428,11 +1452,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
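
`ggml_rope_custom_inplace` takes the same arguments as `ggml_rope_inplace` plus the two frequencies. As I read ggml's implementation, the rotation angle applied to dimension pair `i` at position `pos` is `freq_scale * pos * freq_base**(-2i / n_dims)`, so `freq_scale` simply compresses positions. The helper below is an illustrative sketch of that formula, not code from the library:

    # Illustrative only: approximate rotation angle RoPE applies to dimension
    # pair `i` (0...n_dims/2) at position `pos`, with the two new knobs above.
    def rope_theta(pos, i, n_dims, freq_base: 10_000.0, freq_scale: 1.0)
      (freq_scale * pos) * freq_base**(-2.0 * i / n_dims)
    end

    rope_theta(1000, 0, 128)                   # => 1000.0 (plain RoPE)
    rope_theta(1000, 0, 128, freq_scale: 0.5)  # => 500.0  (position 1000 is treated like 500)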
@@ -2006,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
 
     float cum_sum = 0.0f;
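
The tail-free sampling fix guards the normalization against a (near-)zero sum of second derivatives, falling back to uniform weights instead of dividing by zero. The same logic in Ruby, purely as an illustration of the branch above:

    # Mirrors the guarded normalization above: divide by the sum when it is
    # meaningful, otherwise fall back to a uniform distribution.
    def normalize_second_derivatives!(values)
      sum = values.sum
      if sum > 1e-6
        values.map! { |v| v / sum }
      else
        values.map! { 1.0 / values.size }
      end
    end

    normalize_second_derivatives!([0.0, 0.0, 0.0, 0.0])  # => [0.25, 0.25, 0.25, 0.25]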
@@ -2185,9 +2218,8 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * ctx,
         llama_token_data_array * candidates,
           struct llama_context * guidance_ctx,
-                         float   scale,
-                         float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+                         float   scale) {
+    int64_t t_start_sample_us = ggml_time_us();
 
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
@@ -2207,16 +2239,7 @@ void llama_sample_classifier_free_guidance(
     for (int i = 0; i < n_vocab; ++i) {
         float logit_guidance = logits_guidance[i];
         float logit_base = logits_base[i];
-        logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
-    }
-
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_base = logits_base[i];
-        float logit_guidance = logits_guidance[i];
-
-        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
     }
 
     if (ctx) {
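
Classifier-free guidance is now a single mixing step: each candidate logit becomes `scale * (logit_base - logit_guidance) + logit_guidance`, and the second `smooth_factor` blending pass over log-softmaxed guidance logits is gone. A quick numerical check of the new formula:

    # scale = 1.0 leaves the base logits untouched; larger values push the
    # result further away from the guidance (negative-prompt) distribution.
    mix = ->(logit_base, logit_guidance, scale) do
      scale * (logit_base - logit_guidance) + logit_guidance
    end

    mix.call(2.0, -1.0, 1.0)  # => 2.0 (no guidance effect)
    mix.call(2.0, -1.0, 1.5)  # => 3.5 (pushed away from the guidance logit)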
@@ -2675,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2697,7 +2721,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -2751,9 +2775,9 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
@@ -3535,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
 
-int llama_tokenize(
-        struct llama_context * ctx,
+int llama_tokenize_with_model(
+    const struct llama_model * model,
                   const char * text,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3555,8 +3579,29 @@ int llama_tokenize(
     return res.size();
 }
 
+int llama_tokenize(
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3567,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
-int llama_get_vocab(
-        const struct llama_context * ctx,
+int llama_get_vocab_from_model(
+     const struct llama_model * model,
         const char * * strings,
                float * scores,
                   int   capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i<n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = ctx->vocab.id_to_token[i].score;
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = model->vocab.id_to_token[i].score;
     }
     return n;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+               float * scores,
+                  int   capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
@@ -3588,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
 
-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 
 llama_token llama_token_bos() {
@@ -88,7 +88,13 @@ extern "C" {
         int32_t n_batch;      // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -270,10 +278,21 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -282,6 +301,12 @@ extern "C" {
                            float * scores,
                               int   capacity);
 
+    LLAMA_API int llama_get_vocab_from_model(
+              const struct llama_model * model,
+                          const char * * strings,
+                                 float * scores,
+                                    int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -294,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+                             llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+              const struct llama_model * model,
+                             llama_token token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
@@ -313,13 +344,11 @@ extern "C" {
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
     /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
     /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
     LLAMA_API void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale,
-                             float   smooth_factor);
+                             float   scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.3'
+  VERSION = '0.3.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-32c5411'
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
data/lib/llama_cpp.rb CHANGED
@@ -109,3 +109,4 @@ module LLaMACpp
 end
 
 LLaMACpp.backend_init
+at_exit { LLaMACpp.backend_free }
data/sig/llama_cpp.rbs CHANGED
@@ -39,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer
 
   class TokenData
     public
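
`LLaMACpp.max_devices` exposes the compile-time `LLAMA_MAX_DEVICES` constant, which is also the expected length of a `tensor_split` array. A small sketch; the reported number depends on how the bundled llama.cpp was built (typically 1 for CPU-only builds, more for CUDA builds):

    require 'llama_cpp'

    # How many GPUs the bundled llama.cpp can split tensors across.
    LLaMACpp.max_devices  # e.g. 1 on a CPU-only build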
@@ -69,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end
 
   class Timings
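
`LLaMACpp::Model` now answers vocabulary and tokenization queries itself, backed by the new `*_with_model` / `*_from_model` C functions above, so no `Context` is needed just to tokenize. A minimal sketch, assuming the gem's usual `Model.new(model_path:, params:)` constructor:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    model  = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)

    model.n_vocab                         # vocabulary size (32000 for LLaMA models)
    tokens = model.tokenize(text: 'Hello, World.', add_bos: true)
    tokens.map { |t| model.token_to_str(t) }.join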
@@ -109,7 +116,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
-    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
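
Matching the C change, the Ruby `sample_classifier_free_guidance` call drops the `smooth_factor:` keyword; existing callers only need to remove it. Sketch of the updated call (building `candidates` and the guidance context is assumed to happen as before):

    # candidates   : LLaMACpp::TokenDataArray built from the main context's logits
    # guidance_ctx : a second LLaMACpp::Context that was fed the negative prompt
    context.sample_classifier_free_guidance(candidates, guidance: guidance_ctx, scale: 1.5)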
@@ -140,6 +147,10 @@ module LLaMACpp
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-15 00:00:00.000000000 Z
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: