llama_cpp 0.9.0 → 0.9.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -106,6 +106,14 @@ extern "C" {
106
106
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
107
107
  };
108
108
 
109
+ enum llama_rope_scaling_type {
110
+ LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
111
+ LLAMA_ROPE_SCALING_NONE = 0,
112
+ LLAMA_ROPE_SCALING_LINEAR = 1,
113
+ LLAMA_ROPE_SCALING_YARN = 2,
114
+ LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
115
+ };
116
+
109
117
  typedef struct llama_token_data {
110
118
  llama_token id; // token id
111
119
  float logit; // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
172
180
  uint32_t n_batch; // prompt processing maximum batch size
173
181
  uint32_t n_threads; // number of threads to use for generation
174
182
  uint32_t n_threads_batch; // number of threads to use for batch processing
183
+ int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
175
184
 
176
185
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
177
- float rope_freq_base; // RoPE base frequency, 0 = from model
178
- float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
186
+ float rope_freq_base; // RoPE base frequency, 0 = from model
187
+ float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
188
+ float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
189
+ float yarn_attn_factor; // YaRN magnitude scaling factor
190
+ float yarn_beta_fast; // YaRN low correction dim
191
+ float yarn_beta_slow; // YaRN high correction dim
192
+ uint32_t yarn_orig_ctx; // YaRN original context size
179
193
 
180
194
  // Keep the booleans together to avoid misalignment during copy-by-value.
181
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels
195
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
182
196
  bool f16_kv; // use fp16 for KV cache, fp32 otherwise
183
197
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
184
198
  bool embedding; // embedding mode only
@@ -191,6 +205,7 @@ extern "C" {
191
205
  bool allow_requantize; // allow quantizing non-f32/f16 tensors
192
206
  bool quantize_output_tensor; // quantize output.weight
193
207
  bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
208
+ bool pure; // disable k-quant mixtures and quantize all tensors to the same type
194
209
  } llama_model_quantize_params;
195
210
 
196
211
  // grammar types
@@ -333,17 +348,14 @@ extern "C" {
333
348
  LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
334
349
  "avoid using this, it will be removed in the future, instead - count the tokens in user code");
335
350
 
336
- // Remove all tokens data of cells in [c0, c1)
337
- // c0 < 0 : [0, c1]
338
- // c1 < 0 : [c0, inf)
339
- LLAMA_API void llama_kv_cache_tokens_rm(
340
- struct llama_context * ctx,
341
- int32_t c0,
342
- int32_t c1);
351
+ // Clear the KV cache
352
+ LLAMA_API void llama_kv_cache_clear(
353
+ struct llama_context * ctx);
343
354
 
344
355
  // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
345
- // p0 < 0 : [0, p1]
346
- // p1 < 0 : [p0, inf)
356
+ // seq_id < 0 : match any sequence
357
+ // p0 < 0 : [0, p1]
358
+ // p1 < 0 : [p0, inf)
347
359
  LLAMA_API void llama_kv_cache_seq_rm(
348
360
  struct llama_context * ctx,
349
361
  llama_seq_id seq_id,
@@ -600,6 +612,13 @@ extern "C" {
600
612
  float p,
601
613
  size_t min_keep);
602
614
 
615
+ /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
616
+ LLAMA_API void llama_sample_min_p(
617
+ struct llama_context * ctx,
618
+ llama_token_data_array * candidates,
619
+ float p,
620
+ size_t min_keep);
621
+
603
622
  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
604
623
  LLAMA_API void llama_sample_tail_free(
605
624
  struct llama_context * ctx,
@@ -658,6 +677,7 @@ extern "C" {
658
677
  float * mu);
659
678
 
660
679
  /// @details Selects the token with the highest probability.
680
+ /// Does not compute the token probabilities. Use llama_sample_softmax() instead.
661
681
  LLAMA_API llama_token llama_sample_token_greedy(
662
682
  struct llama_context * ctx,
663
683
  llama_token_data_array * candidates);
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.9.0'
6
+ VERSION = '0.9.1'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b1429'
9
+ LLAMA_CPP_VERSION = 'b1472'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
31
31
  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
32
32
  LLAMA_GRETYPE_CHAR_ALT: Integer
33
33
 
34
+ LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
35
+ LLAMA_ROPE_SCALING_NONE: Integer
36
+ LLAMA_ROPE_SCALING_LINEAR: Integer
37
+ LLAMA_ROPE_SCALING_YARN: Integer
38
+ LLAMA_ROPE_SCALING_MAX_VALUE: Integer
39
+
34
40
  def self?.backend_init: (?numa: bool) -> void
35
41
  def self?.backend_free: () -> void
36
42
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -162,11 +168,11 @@ module LLaMACpp
162
168
  def print_timings: () -> void
163
169
  def reset_timings: () -> void
164
170
  def kv_cache_token_count: () -> Integer
165
- def kv_cache_tokens_rm: (Integer, Integer) -> void
171
+ def kv_cache_clear: () -> void
166
172
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
167
173
  def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
168
174
  def kv_cache_seq_keep: (Integer) -> void
169
- def kv_cache_seq_shift: (Integer, Integer, Ingteger, Integer) -> void
175
+ def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
170
176
  def set_rng_seed: (Integer) -> void
171
177
  def load_session_file: (session_path: String) -> void
172
178
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
@@ -175,6 +181,7 @@ module LLaMACpp
175
181
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
176
182
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
177
183
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
184
+ def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
178
185
  def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
179
186
  def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
180
187
  def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -200,10 +207,22 @@ module LLaMACpp
200
207
  def n_threads=: (Integer) -> Integer
201
208
  def n_threads_batch: () -> Integer
202
209
  def n_threads_batch=: (Integer) -> Integer
210
+ def rope_scaling_type=: (Integer) -> Integer
211
+ def rope_scaling_type: () -> Integer
203
212
  def rope_freq_base=: (Float) -> Float
204
213
  def rope_freq_base: () -> Float
205
214
  def rope_freq_scale=: (Float) -> Float
206
215
  def rope_freq_scale: () -> Float
216
+ def yarn_ext_factor=: (Float) -> Float
217
+ def yarn_ext_factor: () -> Float
218
+ def yarn_attn_factor=: (Float) -> Float
219
+ def yarn_attn_factor: () -> Float
220
+ def yarn_beta_fast=: (Float) -> Float
221
+ def yarn_beta_fast: () -> Float
222
+ def yarn_beta_slow=: (Float) -> Float
223
+ def yarn_beta_slow: () -> Float
224
+ def yarn_orig_ctx=: (Integer) -> Integer
225
+ def yarn_orig_ctx: () -> Integer
207
226
  def mul_mat_q: () -> bool
208
227
  def mul_mat_q=: (bool) -> bool
209
228
  def f16_kv: () -> bool
@@ -227,6 +246,8 @@ module LLaMACpp
227
246
  def quantize_output_tensor=: (bool) -> bool
228
247
  def only_copy: () -> bool
229
248
  def only_copy=: (bool) -> bool
249
+ def pure: () -> bool
250
+ def pure=: (bool) -> bool
230
251
  end
231
252
 
232
253
  class Params = ContextParams
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-10-28 00:00:00.000000000 Z
11
+ date: 2023-11-03 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email:
@@ -36,6 +36,7 @@ files:
36
36
  - ext/llama_cpp/src/ggml-backend.h
37
37
  - ext/llama_cpp/src/ggml-cuda.cu
38
38
  - ext/llama_cpp/src/ggml-cuda.h
39
+ - ext/llama_cpp/src/ggml-impl.h
39
40
  - ext/llama_cpp/src/ggml-metal.h
40
41
  - ext/llama_cpp/src/ggml-metal.m
41
42
  - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
43
44
  - ext/llama_cpp/src/ggml-mpi.h
44
45
  - ext/llama_cpp/src/ggml-opencl.cpp
45
46
  - ext/llama_cpp/src/ggml-opencl.h
47
+ - ext/llama_cpp/src/ggml-quants.c
48
+ - ext/llama_cpp/src/ggml-quants.h
46
49
  - ext/llama_cpp/src/ggml.c
47
50
  - ext/llama_cpp/src/ggml.h
48
- - ext/llama_cpp/src/k_quants.c
49
- - ext/llama_cpp/src/k_quants.h
50
51
  - ext/llama_cpp/src/llama-util.h
51
52
  - ext/llama_cpp/src/llama.cpp
52
53
  - ext/llama_cpp/src/llama.h