llama_cpp 0.15.2 → 0.15.4

This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -81,9 +81,11 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
         LLAMA_VOCAB_PRE_TYPE_REFACT    = 8,
         LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2     = 10,
-        LLAMA_VOCAB_PRE_TYPE_OLMO      = 11,
-        LLAMA_VOCAB_PRE_TYPE_DBRX      = 12,
+        LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2     = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO      = 12,
+        LLAMA_VOCAB_PRE_TYPE_DBRX      = 13,
+        LLAMA_VOCAB_PRE_TYPE_SMAUG     = 14,
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -263,6 +265,8 @@ extern "C" {
         bool check_tensors; // validate model tensor data
     };
 
+    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+    //       https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t seed;  // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
@@ -289,14 +293,14 @@ extern "C" {
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
 
-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache
+        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
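As a rough illustration (not part of the diff): the two KV-cache type fields and flash_attn above are now tagged [EXPERIMENTAL], per the NOTE added earlier in this header. A minimal C sketch of opting into them, assuming a model handle obtained elsewhere; the cache types chosen here are only illustrative, and everything else is left at the upstream defaults:

    #include "llama.h"

    // Build a context that enables the [EXPERIMENTAL] options from this hunk.
    // llama_context_default_params() keeps every other field at its default.
    static struct llama_context * make_experimental_ctx(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true;            // [EXPERIMENTAL] flash attention
        cparams.type_k     = GGML_TYPE_Q8_0;  // [EXPERIMENTAL] quantized K cache (illustrative choice)
        cparams.type_v     = GGML_TYPE_F16;   // V cache kept at 16-bit float
        return llama_new_context_with_model(model, cparams);
    }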
@@ -420,8 +424,8 @@ extern "C" {
 
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
+    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -758,6 +762,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple token).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
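A minimal C sketch (illustrative only) of the two new getters paired with the existing llama_set_n_threads; the thread counts below are arbitrary:

    #include <stdio.h>
    #include "llama.h"

    // Set the generation and batch thread counts, then read them back
    // with the getters added in this release.
    static void tune_threads(struct llama_context * ctx) {
        llama_set_n_threads(ctx, 4, 8); // 4 threads for single-token generation, 8 for prompt/batch processing
        printf("generation threads: %u\n", (unsigned) llama_n_threads(ctx));
        printf("batch threads:      %u\n", (unsigned) llama_n_threads_batch(ctx));
    }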
@@ -816,6 +826,9 @@ extern "C" {
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
 
+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
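A minimal C sketch (illustrative only) of the new predicate, tallying control (non-renderable) tokens across a loaded model's vocabulary; llama_n_vocab is declared elsewhere in this same header:

    #include <stdio.h>
    #include "llama.h"

    // Count how many vocabulary entries are control tokens
    // (tokens not meant to be rendered as text).
    static void report_control_tokens(const struct llama_model * model) {
        const int32_t n_vocab = llama_n_vocab(model);
        int32_t n_control = 0;
        for (llama_token id = 0; id < n_vocab; ++id) {
            if (llama_token_is_control(model, id)) {
                n_control++;
            }
        }
        printf("control tokens: %d of %d\n", n_control, n_vocab);
    }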
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.15.2
+  version: 0.15.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-05-18 00:00:00.000000000 Z
+date: 2024-06-01 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -99,7 +99,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.9
+rubygems_version: 3.5.10
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.