llama_cpp 0.9.5 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -42,7 +42,7 @@
42
42
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
43
43
 
44
44
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
45
- #define LLAMA_SESSION_VERSION 2
45
+ #define LLAMA_SESSION_VERSION 3
46
46
 
47
47
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
48
48
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -158,6 +158,22 @@ extern "C" {
158
158
  llama_seq_id all_seq_id; // used if seq_id == NULL
159
159
  } llama_batch;
160
160
 
161
+ enum llama_model_kv_override_type {
162
+ LLAMA_KV_OVERRIDE_INT,
163
+ LLAMA_KV_OVERRIDE_FLOAT,
164
+ LLAMA_KV_OVERRIDE_BOOL,
165
+ };
166
+
167
+ struct llama_model_kv_override {
168
+ char key[128];
169
+ enum llama_model_kv_override_type tag;
170
+ union {
171
+ int64_t int_value;
172
+ double float_value;
173
+ bool bool_value;
174
+ };
175
+ };
176
+
161
177
  struct llama_model_params {
162
178
  int32_t n_gpu_layers; // number of layers to store in VRAM
163
179
  int32_t main_gpu; // the GPU that is used for scratch and small tensors
@@ -165,9 +181,13 @@ extern "C" {
165
181
 
166
182
  // called with a progress value between 0 and 1, pass NULL to disable
167
183
  llama_progress_callback progress_callback;
184
+
168
185
  // context pointer passed to the progress callback
169
186
  void * progress_callback_user_data;
170
187
 
188
+ // override key-value pairs of the model meta data
189
+ const struct llama_model_kv_override * kv_overrides;
190
+
171
191
  // Keep the booleans together to avoid misalignment during copy-by-value.
172
192
  bool vocab_only; // only load the vocabulary, no weights
173
193
  bool use_mmap; // use mmap if possible
@@ -191,11 +211,14 @@ extern "C" {
191
211
  float yarn_beta_slow; // YaRN high correction dim
192
212
  uint32_t yarn_orig_ctx; // YaRN original context size
193
213
 
214
+ enum ggml_type type_k; // data type for K cache
215
+ enum ggml_type type_v; // data type for V cache
216
+
194
217
  // Keep the booleans together to avoid misalignment during copy-by-value.
195
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
196
- bool f16_kv; // use fp16 for KV cache, fp32 otherwise
197
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
198
- bool embedding; // embedding mode only
218
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
219
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one
220
+ bool embedding; // embedding mode only
221
+ bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
199
222
  };
200
223
 
201
224
  // model quantization parameters
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.9.5'
6
+ VERSION = '0.10.0'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b1593'
9
+ LLAMA_CPP_VERSION = 'b1620'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -23,6 +23,10 @@ module LLaMACpp
23
23
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
24
24
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
25
25
 
26
+ LLAMA_KV_OVERRIDE_INT: Integer
27
+ LLAMA_KV_OVERRIDE_FLOAT: Integer
28
+ LLAMA_KV_OVERRIDE_BOOL: Integer
29
+
26
30
  LLAMA_GRETYPE_END: Integer
27
31
  LLAMA_GRETYPE_ALT: Integer
28
32
  LLAMA_GRETYPE_RULE_REF: Integer
@@ -116,6 +120,16 @@ module LLaMACpp
116
120
  def n_eval: () -> Integer
117
121
  end
118
122
 
123
+ class ModelKVOverride
124
+ public
125
+
126
+ def key: () -> String
127
+ def tag: () -> Integer
128
+ def int_value: () -> Integer
129
+ def float_value: () -> Float
130
+ def bool_value: () -> bool
131
+ end
132
+
119
133
  class ModelParams
120
134
  public
121
135
 
@@ -225,14 +239,18 @@ module LLaMACpp
225
239
  def yarn_beta_slow: () -> Float
226
240
  def yarn_orig_ctx=: (Integer) -> Integer
227
241
  def yarn_orig_ctx: () -> Integer
242
+ def type_k=: (Integer) -> Integer
243
+ def type_k: () -> Integer
244
+ def type_v=: (Integer) -> Integer
245
+ def type_v: () -> Integer
228
246
  def mul_mat_q: () -> bool
229
247
  def mul_mat_q=: (bool) -> bool
230
- def f16_kv: () -> bool
231
- def f16_kv=: (bool) -> bool
232
248
  def logits_all: () -> bool
233
249
  def logits_all=: (bool) -> bool
234
250
  def embedding: () -> bool
235
251
  def embedding=: (bool) -> bool
252
+ def offload_kqv: () -> bool
253
+ def offload_kqv=: (bool) -> bool
236
254
  end
237
255
 
238
256
  class ModelQuantizeParams
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.5
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-02 00:00:00.000000000 Z
11
+ date: 2023-12-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email: