llama_cpp 0.9.0 → 0.9.1

ext/llama_cpp/src/llama.h CHANGED
@@ -106,6 +106,14 @@ extern "C" {
     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
   };
 
+  enum llama_rope_scaling_type {
+    LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+    LLAMA_ROPE_SCALING_NONE        = 0,
+    LLAMA_ROPE_SCALING_LINEAR      = 1,
+    LLAMA_ROPE_SCALING_YARN        = 2,
+    LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+  };
+
   typedef struct llama_token_data {
     llama_token id;    // token id
     float logit;       // log-odds of the token
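
In the gem, these enum values are exposed as integer constants on the LLaMACpp module (see the sig changes further below). A minimal Ruby sketch of reading the scaling type back from a ContextParams object; ContextParams.new with library defaults is assumed here, and LLAMA_ROPE_SCALING_UNSPECIFIED (-1) means the value is taken from the model file:

    require 'llama_cpp'

    SCALING_NAMES = {
      LLaMACpp::LLAMA_ROPE_SCALING_UNSPECIFIED => 'unspecified (use the model metadata)',
      LLaMACpp::LLAMA_ROPE_SCALING_NONE        => 'none',
      LLaMACpp::LLAMA_ROPE_SCALING_LINEAR      => 'linear',
      LLaMACpp::LLAMA_ROPE_SCALING_YARN        => 'YaRN'
    }.freeze

    params = LLaMACpp::ContextParams.new  # assumed default constructor
    puts SCALING_NAMES.fetch(params.rope_scaling_type, 'unknown')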
@@ -172,13 +180,19 @@ extern "C" {
     uint32_t n_batch;          // prompt processing maximum batch size
     uint32_t n_threads;        // number of threads to use for generation
     uint32_t n_threads_batch;  // number of threads to use for batch processing
+    int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-    float rope_freq_base;     // RoPE base frequency, 0 = from model
-    float rope_freq_scale;    // RoPE frequency scaling factor, 0 = from model
+    float    rope_freq_base;   // RoPE base frequency, 0 = from model
+    float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+    float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+    float    yarn_attn_factor; // YaRN magnitude scaling factor
+    float    yarn_beta_fast;   // YaRN low correction dim
+    float    yarn_beta_slow;   // YaRN high correction dim
+    uint32_t yarn_orig_ctx;    // YaRN original context size
 
     // Keep the booleans together to avoid misalignment during copy-by-value.
-    bool mul_mat_q;   // if true, use experimental mul_mat_q kernels
+    bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
     bool f16_kv;      // use fp16 for KV cache, fp32 otherwise
     bool logits_all;  // the llama_eval() call computes all logits, not just the last one
     bool embedding;   // embedding mode only
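
A hedged sketch of configuring the new fields through the gem's ContextParams accessors (declared in the sig changes further below). The numeric values are illustrative only, and ContextParams.new is assumed to start from the library defaults:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.rope_scaling_type = LLaMACpp::LLAMA_ROPE_SCALING_YARN
    params.rope_freq_base    = 0.0    # 0 = take the base frequency from the model
    params.rope_freq_scale   = 0.0    # 0 = take the scaling factor from the model
    params.yarn_orig_ctx     = 4096   # original training context size (illustrative)
    params.yarn_ext_factor   = 1.0    # extrapolation mix factor (illustrative)
    params.yarn_attn_factor  = 1.0    # magnitude scaling factor
    params.yarn_beta_fast    = 32.0   # low correction dim
    params.yarn_beta_slow    = 1.0    # high correction dim
    # params is then passed when constructing the LLaMACpp::Context.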
@@ -191,6 +205,7 @@ extern "C" {
     bool allow_requantize;        // allow quantizing non-f32/f16 tensors
     bool quantize_output_tensor;  // quantize output.weight
     bool only_copy;               // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+    bool pure;                    // disable k-quant mixtures and quantize all tensors to the same type
   } llama_model_quantize_params;
 
   // grammar types
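
A sketch of the new `pure` flag through the gem's quantization entry point (the module-level model_quantize and the ModelQuantizeParams#pure accessor appear in the sig changes further below); ModelQuantizeParams.new is assumed and the file paths are placeholders:

    require 'llama_cpp'

    qparams = LLaMACpp::ModelQuantizeParams.new
    qparams.pure = true  # no k-quant mixtures: every tensor gets the same quantization type

    LLaMACpp.model_quantize(input_path: 'model-f16.gguf',
                            output_path: 'model-quantized.gguf',
                            params: qparams)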
@@ -333,17 +348,14 @@ extern "C" {
   LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
     "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-  // Remove all tokens data of cells in [c0, c1)
-  // c0 < 0 : [0, c1]
-  // c1 < 0 : [c0, inf)
-  LLAMA_API void llama_kv_cache_tokens_rm(
-      struct llama_context * ctx,
-      int32_t c0,
-      int32_t c1);
+  // Clear the KV cache
+  LLAMA_API void llama_kv_cache_clear(
+      struct llama_context * ctx);
 
   // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-  // p0 < 0 : [0, p1]
-  // p1 < 0 : [p0, inf)
+  // seq_id < 0 : match any sequence
+  // p0 < 0     : [0, p1]
+  // p1 < 0     : [p0, inf)
   LLAMA_API void llama_kv_cache_seq_rm(
       struct llama_context * ctx,
       llama_seq_id seq_id,
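
A hedged sketch of the renamed clear call and the sequence-scoped removal through the gem's wrappers (kv_cache_clear and kv_cache_seq_rm in the sig changes further below); `context` is assumed to be an already constructed LLaMACpp::Context:

    context.kv_cache_clear               # wipe the whole KV cache (replaces kv_cache_tokens_rm)
    context.kv_cache_seq_rm(0, 10, -1)   # sequence 0: drop positions [10, inf); p1 < 0 means "to the end"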
@@ -600,6 +612,13 @@ extern "C" {
       float p,
       size_t min_keep);
 
+  /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+  LLAMA_API void llama_sample_min_p(
+      struct llama_context * ctx,
+      llama_token_data_array * candidates,
+      float p,
+      size_t min_keep);
+
   /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
   LLAMA_API void llama_sample_tail_free(
       struct llama_context * ctx,
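
A sketch of min-p sampling via the gem's wrapper (sample_min_p in the sig changes further below), combined with temperature. Here `context` and `candidates` are assumed to be an existing Context and a TokenDataArray built from the latest logits, and the final sample_token call is assumed to behave like the other sampling wrappers:

    context.sample_min_p(candidates, prob: 0.05, min_keep: 1)  # keep tokens with p >= 0.05 * p_max
    context.sample_temp(candidates, temp: 0.8)
    token = context.sample_token(candidates)                   # assumed final picking call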
@@ -658,6 +677,7 @@ extern "C" {
       float * mu);
 
   /// @details Selects the token with the highest probability.
+  ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
   LLAMA_API llama_token llama_sample_token_greedy(
       struct llama_context * ctx,
       llama_token_data_array * candidates);
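
The note above means greedy selection leaves the probability fields of the candidates untouched; if the probabilities are needed (for example, for logging), softmax has to be applied first. A short Ruby sketch under the same assumptions as the previous examples, with sample_token_greedy assumed to mirror the C API:

    context.sample_softmax(candidates)                # sorts candidates and fills in probabilities
    token = context.sample_token_greedy(candidates)   # picks the highest-probability token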
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1429'
+  LLAMA_CPP_VERSION = 'b1472'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -162,11 +168,11 @@ module LLaMACpp
   def print_timings: () -> void
   def reset_timings: () -> void
   def kv_cache_token_count: () -> Integer
-  def kv_cache_tokens_rm: (Integer, Integer) -> void
+  def kv_cache_clear: () -> void
   def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
   def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
   def kv_cache_seq_keep: (Integer) -> void
-  def kv_cache_seq_shift: (Integer, Integer, Ingteger, Integer) -> void
+  def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
   def set_rng_seed: (Integer) -> void
   def load_session_file: (session_path: String) -> void
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
@@ -175,6 +181,7 @@ module LLaMACpp
   def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
   def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
   def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+  def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
   def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
   def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
   def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -200,10 +207,22 @@ module LLaMACpp
   def n_threads=: (Integer) -> Integer
   def n_threads_batch: () -> Integer
   def n_threads_batch=: (Integer) -> Integer
+  def rope_scaling_type=: (Integer) -> Integer
+  def rope_scaling_type: () -> Integer
   def rope_freq_base=: (Float) -> Float
   def rope_freq_base: () -> Float
   def rope_freq_scale=: (Float) -> Float
   def rope_freq_scale: () -> Float
+  def yarn_ext_factor=: (Float) -> Float
+  def yarn_ext_factor: () -> Float
+  def yarn_attn_factor=: (Float) -> Float
+  def yarn_attn_factor: () -> Float
+  def yarn_beta_fast=: (Float) -> Float
+  def yarn_beta_fast: () -> Float
+  def yarn_beta_slow=: (Float) -> Float
+  def yarn_beta_slow: () -> Float
+  def yarn_orig_ctx=: (Integer) -> Integer
+  def yarn_orig_ctx: () -> Integer
   def mul_mat_q: () -> bool
   def mul_mat_q=: (bool) -> bool
   def f16_kv: () -> bool
@@ -227,6 +246,8 @@ module LLaMACpp
   def quantize_output_tensor=: (bool) -> bool
   def only_copy: () -> bool
   def only_copy=: (bool) -> bool
+  def pure: () -> bool
+  def pure=: (bool) -> bool
 end
 
 class Params = ContextParams
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-28 00:00:00.000000000 Z
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h