llama_cpp 0.9.4 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -42,7 +42,7 @@
|
|
42
42
|
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
43
43
|
|
44
44
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
45
|
-
#define LLAMA_SESSION_VERSION
|
45
|
+
#define LLAMA_SESSION_VERSION 3
|
46
46
|
|
47
47
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
48
48
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
@@ -158,6 +158,22 @@ extern "C" {
|
|
158
158
|
llama_seq_id all_seq_id; // used if seq_id == NULL
|
159
159
|
} llama_batch;
|
160
160
|
|
161
|
+
enum llama_model_kv_override_type {
|
162
|
+
LLAMA_KV_OVERRIDE_INT,
|
163
|
+
LLAMA_KV_OVERRIDE_FLOAT,
|
164
|
+
LLAMA_KV_OVERRIDE_BOOL,
|
165
|
+
};
|
166
|
+
|
167
|
+
struct llama_model_kv_override {
|
168
|
+
char key[128];
|
169
|
+
enum llama_model_kv_override_type tag;
|
170
|
+
union {
|
171
|
+
int64_t int_value;
|
172
|
+
double float_value;
|
173
|
+
bool bool_value;
|
174
|
+
};
|
175
|
+
};
|
176
|
+
|
161
177
|
struct llama_model_params {
|
162
178
|
int32_t n_gpu_layers; // number of layers to store in VRAM
|
163
179
|
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
@@ -165,9 +181,13 @@ extern "C" {
|
|
165
181
|
|
166
182
|
// called with a progress value between 0 and 1, pass NULL to disable
|
167
183
|
llama_progress_callback progress_callback;
|
184
|
+
|
168
185
|
// context pointer passed to the progress callback
|
169
186
|
void * progress_callback_user_data;
|
170
187
|
|
188
|
+
// override key-value pairs of the model meta data
|
189
|
+
const struct llama_model_kv_override * kv_overrides;
|
190
|
+
|
171
191
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
172
192
|
bool vocab_only; // only load the vocabulary, no weights
|
173
193
|
bool use_mmap; // use mmap if possible
|
@@ -185,17 +205,20 @@ extern "C" {
|
|
185
205
|
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
186
206
|
float rope_freq_base; // RoPE base frequency, 0 = from model
|
187
207
|
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
188
|
-
float yarn_ext_factor; // YaRN extrapolation mix factor,
|
208
|
+
float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
|
189
209
|
float yarn_attn_factor; // YaRN magnitude scaling factor
|
190
210
|
float yarn_beta_fast; // YaRN low correction dim
|
191
211
|
float yarn_beta_slow; // YaRN high correction dim
|
192
212
|
uint32_t yarn_orig_ctx; // YaRN original context size
|
193
213
|
|
214
|
+
enum ggml_type type_k; // data type for K cache
|
215
|
+
enum ggml_type type_v; // data type for V cache
|
216
|
+
|
194
217
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
195
|
-
bool mul_mat_q;
|
196
|
-
bool f16_kv;
|
197
|
-
bool logits_all;
|
198
|
-
bool embedding;
|
218
|
+
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
219
|
+
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
220
|
+
bool embedding; // embedding mode only
|
221
|
+
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
199
222
|
};
|
200
223
|
|
201
224
|
// model quantization parameters
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.9.4'
|
6
|
+
VERSION = '0.10.0'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = '
|
9
|
+
LLAMA_CPP_VERSION = 'b1620'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -23,6 +23,10 @@ module LLaMACpp
|
|
23
23
|
LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
|
24
24
|
LLAMA_FTYPE_MOSTLY_Q6_K: Integer
|
25
25
|
|
26
|
+
LLAMA_KV_OVERRIDE_INT: Integer
|
27
|
+
LLAMA_KV_OVERRIDE_FLOAT: Integer
|
28
|
+
LLAMA_KV_OVERRIDE_BOOL: Integer
|
29
|
+
|
26
30
|
LLAMA_GRETYPE_END: Integer
|
27
31
|
LLAMA_GRETYPE_ALT: Integer
|
28
32
|
LLAMA_GRETYPE_RULE_REF: Integer
|
@@ -116,6 +120,16 @@ module LLaMACpp
|
|
116
120
|
def n_eval: () -> Integer
|
117
121
|
end
|
118
122
|
|
123
|
+
class ModelKVOverride
|
124
|
+
public
|
125
|
+
|
126
|
+
def key: () -> String
|
127
|
+
def tag: () -> Integer
|
128
|
+
def int_value: () -> Integer
|
129
|
+
def float_value: () -> Float
|
130
|
+
def bool_value: () -> bool
|
131
|
+
end
|
132
|
+
|
119
133
|
class ModelParams
|
120
134
|
public
|
121
135
|
|
@@ -225,14 +239,18 @@ module LLaMACpp
|
|
225
239
|
def yarn_beta_slow: () -> Float
|
226
240
|
def yarn_orig_ctx=: (Integer) -> Integer
|
227
241
|
def yarn_orig_ctx: () -> Integer
|
242
|
+
def type_k=: (Integer) -> Integer
|
243
|
+
def type_k: () -> Integer
|
244
|
+
def type_v=: (Integer) -> Integer
|
245
|
+
def type_v: () -> Integer
|
228
246
|
def mul_mat_q: () -> bool
|
229
247
|
def mul_mat_q=: (bool) -> bool
|
230
|
-
def f16_kv: () -> bool
|
231
|
-
def f16_kv=: (bool) -> bool
|
232
248
|
def logits_all: () -> bool
|
233
249
|
def logits_all=: (bool) -> bool
|
234
250
|
def embedding: () -> bool
|
235
251
|
def embedding=: (bool) -> bool
|
252
|
+
def offload_kqv: () -> bool
|
253
|
+
def offload_kqv=: (bool) -> bool
|
236
254
|
end
|
237
255
|
|
238
256
|
class ModelQuantizeParams
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.4
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-12-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
80
80
|
- !ruby/object:Gem::Version
|
81
81
|
version: '0'
|
82
82
|
requirements: []
|
83
|
-
rubygems_version: 3.4.
|
83
|
+
rubygems_version: 3.4.22
|
84
84
|
signing_key:
|
85
85
|
specification_version: 4
|
86
86
|
summary: Ruby bindings for the llama.cpp.
|