llama_cpp 0.9.0 → 0.9.1
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        = 0,
+        LLAMA_ROPE_SCALING_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_YARN        = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
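These scaling types are also exported as Ruby constants in this release (see the llama_cpp.rbs changes further down), so code can refer to them by name. A minimal sketch, assuming the gem is installed:

    require 'llama_cpp'

    # The Ruby constants mirror the C enum values: -1, 0, 1, 2.
    p LLaMACpp::LLAMA_ROPE_SCALING_UNSPECIFIED # => -1
    p LLaMACpp::LLAMA_ROPE_SCALING_YARN        # => 2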
@@ -172,13 +180,19 @@
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float
-        float
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
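Matching ContextParams accessors appear in the llama_cpp.rbs changes below, so the new RoPE/YaRN knobs are settable from Ruby. A minimal sketch, assuming ContextParams.new behaves as in earlier 0.9.x releases; the values are purely illustrative:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.rope_scaling_type = LLaMACpp::LLAMA_ROPE_SCALING_YARN
    params.rope_freq_base    = 0.0  # 0 = take the value from the model file
    params.rope_freq_scale   = 0.0  # 0 = take the value from the model file
    params.yarn_orig_ctx     = 4096 # illustrative original training context size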
@@ -191,6 +205,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
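The new pure flag is mirrored by a Ruby accessor on ModelQuantizeParams (see the llama_cpp.rbs changes below). A minimal sketch with placeholder file paths; ModelQuantizeParams.new is assumed to work as in earlier releases:

    require 'llama_cpp'

    params = LLaMACpp::ModelQuantizeParams.new
    params.pure = true # no k-quant mixtures: every tensor gets the same quantization type

    LLaMACpp.model_quantize(input_path: 'model-f16.gguf', output_path: 'model-q.gguf', params: params)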
@@ -333,17 +348,14 @@
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    //
-
-
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-                           int32_t   c0,
-                           int32_t   c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    //
-    //
+    // seq_id < 0 : match any sequence
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
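On the Ruby side this maps to Context#kv_cache_clear replacing the removed token-based method, while ranged removal stays on the sequence API (method names per the llama_cpp.rbs changes below; context construction is elided):

    context.kv_cache_clear             # drop the entire KV cache
    context.kv_cache_seq_rm(0, 10, -1) # seq 0: remove positions [10, inf), per the p1 < 0 rule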
@@ -600,6 +612,13 @@
             float     p,
             size_t    min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
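Min-p keeps every candidate whose probability is at least p times that of the most likely token. The Ruby binding exposes it as Context#sample_min_p (signature in the llama_cpp.rbs changes below); building the TokenDataArray from the current logits is elided here:

    # candidates is a LLaMACpp::TokenDataArray for the current position
    context.sample_min_p(candidates, prob: 0.05, min_keep: 1)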
@@ -658,6 +677,7 @@
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
           llama_token_data_array * candidates);
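Practical consequence: if you want to inspect probabilities after greedy selection, normalize them yourself first. A minimal sketch in terms of the gem's sampling methods (hedged; sample_token_greedy is assumed to be bound like the other samplers shown in the .rbs below):

    context.sample_softmax(candidates)              # fills in normalized probabilities
    token = context.sample_token_greedy(candidates) # picks by logit only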
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1472'
 end
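Both constants are public, so callers can check the bundled llama.cpp build at runtime:

    require 'llama_cpp'

    puts LLaMACpp::VERSION           # => 0.9.1
    puts LLaMACpp::LLAMA_CPP_VERSION # => b1472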
data/sig/llama_cpp.rbs
CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -162,11 +168,11 @@ module LLaMACpp
     def print_timings: () -> void
     def reset_timings: () -> void
     def kv_cache_token_count: () -> Integer
-    def
+    def kv_cache_clear: () -> void
     def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
     def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
     def kv_cache_seq_keep: (Integer) -> void
-    def kv_cache_seq_shift: (Integer, Integer,
+    def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
@@ -175,6 +181,7 @@ module LLaMACpp
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -200,10 +207,22 @@ module LLaMACpp
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
     def n_threads_batch=: (Integer) -> Integer
+    def rope_scaling_type=: (Integer) -> Integer
+    def rope_scaling_type: () -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
     def rope_freq_scale: () -> Float
+    def yarn_ext_factor=: (Float) -> Float
+    def yarn_ext_factor: () -> Float
+    def yarn_attn_factor=: (Float) -> Float
+    def yarn_attn_factor: () -> Float
+    def yarn_beta_fast=: (Float) -> Float
+    def yarn_beta_fast: () -> Float
+    def yarn_beta_slow=: (Float) -> Float
+    def yarn_beta_slow: () -> Float
+    def yarn_orig_ctx=: (Integer) -> Integer
+    def yarn_orig_ctx: () -> Integer
     def mul_mat_q: () -> bool
     def mul_mat_q=: (bool) -> bool
     def f16_kv: () -> bool
@@ -227,6 +246,8 @@ module LLaMACpp
     def quantize_output_tensor=: (bool) -> bool
     def only_copy: () -> bool
     def only_copy=: (bool) -> bool
+    def pure: () -> bool
+    def pure=: (bool) -> bool
   end
 
   class Params = ContextParams
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h