llama_cpp 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/llama_cpp/src/llama.h CHANGED
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        = 0,
+        LLAMA_ROPE_SCALING_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_YARN        = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id;    // token id
         float logit;       // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency, 0 = from model
-        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
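The new RoPE scaling enum and the YaRN fields in llama_context_params are exposed on LLaMACpp::ContextParams in this release (see the RBS changes further down). A minimal Ruby sketch, assuming default-constructed params; the yarn_orig_ctx value is only illustrative:

params = LLaMACpp::ContextParams.new
params.rope_scaling_type = LLaMACpp::LLAMA_ROPE_SCALING_YARN
params.rope_freq_base  = 0.0   # 0 = take the value from the model file
params.rope_freq_scale = 0.0
params.yarn_orig_ctx   = 4096  # original training context of the model (example value)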
@@ -191,6 +205,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
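From Ruby, the new pure flag sits on LLaMACpp::ModelQuantizeParams and is passed to LLaMACpp.model_quantize (both appear in the RBS below). A hedged sketch with placeholder file names:

params = LLaMACpp::ModelQuantizeParams.new
params.pure = true  # quantize every tensor to the same type, no k-quant mixture
LLaMACpp.model_quantize(input_path: 'model-f16.gguf', output_path: 'model-q.gguf', params: params)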
@@ -333,17 +348,14 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0, c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-            int32_t c0,
-            int32_t c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0, p1]
+    // p1 < 0     : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
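The Ruby binding mirrors this rename on LLaMACpp::Context (see the RBS changes below). A brief sketch, assuming an already constructed context:

context.kv_cache_clear              # replaces the removed kv_cache_tokens_rm
context.kv_cache_seq_rm(0, 32, -1)  # drop positions [32, inf) of sequence 0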
@@ -494,21 +506,22 @@ extern "C" {
     // Vocab
     //
 
-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
 
-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
 
     //
     // Tokenization
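In the Ruby API the vocabulary and special-token helpers accordingly move from Context to Model (the change to data/lib/llama_cpp.rb below already uses them). A short sketch against an existing context:

eos = context.model.token_eos   # was context.token_eos in 0.8.0
bos = context.model.token_bos
puts context.model.text(bos)    # text backing a token id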
@@ -560,21 +573,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            const llama_token * last_tokens,
-            size_t last_tokens_size,
-            float penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
             const llama_token * last_tokens,
-            size_t last_tokens_size,
-            float alpha_frequency,
-            float alpha_presence);
+            size_t penalty_last_n,
+            float penalty_repeat,
+            float penalty_freq,
+            float penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
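The two penalty samplers are merged into a single Context#sample_repetition_penalties in the Ruby binding, as the updated generate helper below shows. A hedged sketch with illustrative penalty values:

context.sample_repetition_penalties(
  candidates, last_n_tokens,
  penalty_repeat: 1.1, penalty_freq: 0.0, penalty_present: 0.0
)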
@@ -605,6 +612,13 @@ extern "C" {
             float p,
             size_t min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
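The min-p sampler is also exposed on LLaMACpp::Context (keyword names taken from the RBS below); it keeps only tokens whose probability is at least p times that of the most likely token. A minimal sketch:

context.sample_min_p(candidates, prob: 0.05, min_keep: 1)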
@@ -663,6 +677,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
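If token probabilities are wanted alongside the greedy pick, run the softmax sampler first, since the greedy sampler does not compute them. Context#sample_softmax is listed in the RBS below; Context#sample_token_greedy is assumed here to be the Ruby binding of llama_sample_token_greedy:

context.sample_softmax(candidates)               # normalizes logits into probabilities
token = context.sample_token_greedy(candidates)  # picks the highest-probability token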
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.8.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1405'
+  LLAMA_CPP_VERSION = 'b1472'
 end
data/lib/llama_cpp.rb CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp
 
       # apply penalties
       last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-      context.sample_frequency_and_presence_penalties(
-        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
       )
 
       # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp
 
       embd.each { |token| output << context.model.token_to_piece(token) }
 
-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end
 
     output.join.scrub('?').strip.delete_prefix(prompt).strip
data/sig/llama_cpp.rbs CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -82,6 +88,16 @@ module LLaMACpp
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end
 
   class Timings
@@ -143,16 +159,6 @@
 
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -162,20 +168,20 @@
     def print_timings: () -> void
     def reset_timings: () -> void
     def kv_cache_token_count: () -> Integer
-    def kv_cache_tokens_rm: (Integer, Integer) -> void
+    def kv_cache_clear: () -> void
     def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
     def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
     def kv_cache_seq_keep: (Integer) -> void
-    def kv_cache_seq_shift: (Integer, Integer, Ingteger, Integer) -> void
+    def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -201,10 +207,22 @@
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
     def n_threads_batch=: (Integer) -> Integer
+    def rope_scaling_type=: (Integer) -> Integer
+    def rope_scaling_type: () -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
     def rope_freq_scale: () -> Float
+    def yarn_ext_factor=: (Float) -> Float
+    def yarn_ext_factor: () -> Float
+    def yarn_attn_factor=: (Float) -> Float
+    def yarn_attn_factor: () -> Float
+    def yarn_beta_fast=: (Float) -> Float
+    def yarn_beta_fast: () -> Float
+    def yarn_beta_slow=: (Float) -> Float
+    def yarn_beta_slow: () -> Float
+    def yarn_orig_ctx=: (Integer) -> Integer
+    def yarn_orig_ctx: () -> Integer
     def mul_mat_q: () -> bool
     def mul_mat_q=: (bool) -> bool
     def f16_kv: () -> bool
@@ -228,6 +246,8 @@
     def quantize_output_tensor=: (bool) -> bool
     def only_copy: () -> bool
     def only_copy=: (bool) -> bool
+    def pure: () -> bool
+    def pure=: (bool) -> bool
   end
 
   class Params = ContextParams
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-21 00:00:00.000000000 Z
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h