llama_cpp 0.9.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE = 0,
+        LLAMA_ROPE_SCALING_LINEAR = 1,
+        LLAMA_ROPE_SCALING_YARN = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit; // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
     uint32_t n_batch; // prompt processing maximum batch size
     uint32_t n_threads; // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
+    int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-    float rope_freq_base;  // RoPE base frequency, 0 = from model
-    float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+    float    rope_freq_base;   // RoPE base frequency, 0 = from model
+    float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+    float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+    float    yarn_attn_factor; // YaRN magnitude scaling factor
+    float    yarn_beta_fast;   // YaRN low correction dim
+    float    yarn_beta_slow;   // YaRN high correction dim
+    uint32_t yarn_orig_ctx;    // YaRN original context size
 
     // Keep the booleans together to avoid misalignment during copy-by-value.
-    bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+    bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
     bool f16_kv; // use fp16 for KV cache, fp32 otherwise
     bool logits_all; // the llama_eval() call computes all logits, not just the last one
     bool embedding; // embedding mode only
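The new llama_context_params fields expose llama.cpp's YaRN RoPE scaling. Below is a minimal sketch (not part of the gem) of filling them in from C against this header; llama_context_default_params() and llama_new_context_with_model() are pre-existing llama.h calls, and the numeric values are illustrative placeholders, not recommendations:

#include "llama.h"

// sketch: build a llama_context_params with the new RoPE/YaRN fields set
struct llama_context_params make_yarn_params(void) {
    struct llama_context_params cparams = llama_context_default_params();

    // new int8_t field; values come from enum llama_rope_scaling_type
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;

    cparams.yarn_orig_ctx    = 4096;  // placeholder: the model's original training context
    cparams.yarn_ext_factor  = 1.0f;  // extrapolation mix factor (NaN = take from model)
    cparams.yarn_attn_factor = 1.0f;  // magnitude scaling factor
    cparams.yarn_beta_fast   = 32.0f; // low correction dim
    cparams.yarn_beta_slow   = 1.0f;  // high correction dim

    return cparams; // pass to llama_new_context_with_model() as usual
}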
@@ -191,6 +205,7 @@ extern "C" {
     bool allow_requantize; // allow quantizing non-f32/f16 tensors
     bool quantize_output_tensor; // quantize output.weight
     bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+    bool pure; // disable k-quant mixtures and quantize all tensors to the same type
 } llama_model_quantize_params;
 
 // grammar types
@@ -333,17 +348,14 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0, c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-            int32_t c0,
-            int32_t c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
+    // seq_id < 0 : match any sequence
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
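With this change, clearing the whole cache goes through llama_kv_cache_clear() while ranged removal stays on llama_kv_cache_seq_rm(). A small sketch of the distinction, assuming ctx is a valid llama_context created elsewhere:

#include "llama.h"

// sketch: the two ways of pruning the KV cache after this release
static void prune_kv_cache(struct llama_context * ctx) {
    // drop every cached token, regardless of sequence (replaces llama_kv_cache_tokens_rm)
    llama_kv_cache_clear(ctx);

    // or remove only positions [32, inf) of sequence 0; p1 < 0 means "to the end"
    llama_kv_cache_seq_rm(ctx, 0, 32, -1);

    // seq_id < 0 matches any sequence, so this also empties the cache
    llama_kv_cache_seq_rm(ctx, -1, 0, -1);
}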
@@ -600,6 +612,13 @@ extern "C" {
             float p,
             size_t min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
@@ -658,6 +677,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    /// Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
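A sketch of a sampling step that slots the new llama_sample_min_p() in front of the existing temperature sampler; llama_n_vocab(), llama_get_logits(), llama_sample_temp() and llama_sample_token() are pre-existing llama.h calls, and the 0.05 / 0.80 values are illustrative only:

#include "llama.h"
#include <stdbool.h>
#include <stdlib.h>

// sketch: assumes `ctx` and `model` are valid and llama_decode() has just produced logits
static llama_token sample_next_token(struct llama_context * ctx, const struct llama_model * model) {
    const int n_vocab = llama_n_vocab(model);
    float * logits = llama_get_logits(ctx);

    // build the candidate array from the raw logits
    llama_token_data * data = malloc(n_vocab * sizeof(llama_token_data));
    for (int i = 0; i < n_vocab; i++) {
        data[i] = (llama_token_data){ i, logits[i], 0.0f };
    }
    llama_token_data_array candidates = { data, (size_t) n_vocab, false };

    // min-p: keep only tokens whose probability is at least 0.05 times that of the top token
    llama_sample_min_p(ctx, &candidates, 0.05f, 1);
    llama_sample_temp(ctx, &candidates, 0.80f);
    const llama_token id = llama_sample_token(ctx, &candidates);

    free(data);
    return id;
}

For deterministic decoding, llama_sample_token_greedy() could replace the last two calls; per the new comment above, it does not fill in token probabilities, so call llama_sample_softmax() first if those are needed.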
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1472'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -162,11 +168,11 @@ module LLaMACpp
   def print_timings: () -> void
   def reset_timings: () -> void
   def kv_cache_token_count: () -> Integer
-  def kv_cache_tokens_rm: (Integer, Integer) -> void
+  def kv_cache_clear: () -> void
   def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
   def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
   def kv_cache_seq_keep: (Integer) -> void
-  def kv_cache_seq_shift: (Integer, Integer,Integer, Integer) -> void
+  def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
   def set_rng_seed: (Integer) -> void
   def load_session_file: (session_path: String) -> void
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
@@ -175,6 +181,7 @@ module LLaMACpp
   def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
   def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
   def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+  def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
   def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
   def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
   def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -200,10 +207,22 @@ module LLaMACpp
   def n_threads=: (Integer) -> Integer
   def n_threads_batch: () -> Integer
   def n_threads_batch=: (Integer) -> Integer
+  def rope_scaling_type=: (Integer) -> Integer
+  def rope_scaling_type: () -> Integer
   def rope_freq_base=: (Float) -> Float
   def rope_freq_base: () -> Float
   def rope_freq_scale=: (Float) -> Float
   def rope_freq_scale: () -> Float
+  def yarn_ext_factor=: (Float) -> Float
+  def yarn_ext_factor: () -> Float
+  def yarn_attn_factor=: (Float) -> Float
+  def yarn_attn_factor: () -> Float
+  def yarn_beta_fast=: (Float) -> Float
+  def yarn_beta_fast: () -> Float
+  def yarn_beta_slow=: (Float) -> Float
+  def yarn_beta_slow: () -> Float
+  def yarn_orig_ctx=: (Integer) -> Integer
+  def yarn_orig_ctx: () -> Integer
   def mul_mat_q: () -> bool
   def mul_mat_q=: (bool) -> bool
   def f16_kv: () -> bool
@@ -227,6 +246,8 @@ module LLaMACpp
   def quantize_output_tensor=: (bool) -> bool
   def only_copy: () -> bool
   def only_copy=: (bool) -> bool
+  def pure: () -> bool
+  def pure=: (bool) -> bool
 end
 
 class Params = ContextParams
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h