llama_cpp 0.8.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +228 -165
- data/ext/llama_cpp/src/ggml-cuda.cu +441 -77
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +71 -42
- data/ext/llama_cpp/src/ggml-metal.metal +171 -35
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1303 -3419
- data/ext/llama_cpp/src/ggml.h +33 -11
- data/ext/llama_cpp/src/llama.cpp +1925 -2655
- data/ext/llama_cpp/src/llama.h +48 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +34 -14
- metadata +5 -4
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        =  0,
+        LLAMA_ROPE_SCALING_LINEAR      =  1,
+        LLAMA_ROPE_SCALING_YARN        =  2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
@@ -191,6 +205,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
@@ -333,17 +348,14 @@ extern "C" {
    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0,  c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-                         int32_t   c0,
-                         int32_t   c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1]
+    // p1 < 0     : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
@@ -494,21 +506,22 @@ extern "C" {
     // Vocab
     //
 
-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
 
-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
 
     //
     // Tokenization
@@ -560,21 +573,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   alpha_frequency,
-                           float   alpha_presence);
+                          size_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -605,6 +612,13 @@ extern "C" {
                            float   p,
                          size_t   min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
@@ -663,6 +677,7 @@ extern "C" {
                            float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
            struct llama_context * ctx,
           llama_token_data_array * candidates);
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.8.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1472'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp
 
      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-      context.sample_frequency_and_presence_penalties(
-        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )
 
      # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp
 
      embd.each { |token| output << context.model.token_to_piece(token) }
 
-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
    end
 
    output.join.scrub('?').strip.delete_prefix(prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -82,6 +88,16 @@ module LLaMACpp
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end
 
   class Timings
@@ -143,16 +159,6 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -162,20 +168,20 @@ module LLaMACpp
     def print_timings: () -> void
     def reset_timings: () -> void
     def kv_cache_token_count: () -> Integer
-    def kv_cache_tokens_rm: (Integer, Integer) -> void
+    def kv_cache_clear: () -> void
     def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
     def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
     def kv_cache_seq_keep: (Integer) -> void
-    def kv_cache_seq_shift: (Integer, Integer, Integer) -> void
+    def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -201,10 +207,22 @@ module LLaMACpp
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
     def n_threads_batch=: (Integer) -> Integer
+    def rope_scaling_type=: (Integer) -> Integer
+    def rope_scaling_type: () -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
     def rope_freq_scale: () -> Float
+    def yarn_ext_factor=: (Float) -> Float
+    def yarn_ext_factor: () -> Float
+    def yarn_attn_factor=: (Float) -> Float
+    def yarn_attn_factor: () -> Float
+    def yarn_beta_fast=: (Float) -> Float
+    def yarn_beta_fast: () -> Float
+    def yarn_beta_slow=: (Float) -> Float
+    def yarn_beta_slow: () -> Float
+    def yarn_orig_ctx=: (Integer) -> Integer
+    def yarn_orig_ctx: () -> Integer
     def mul_mat_q: () -> bool
     def mul_mat_q=: (bool) -> bool
     def f16_kv: () -> bool
@@ -228,6 +246,8 @@ module LLaMACpp
     def quantize_output_tensor=: (bool) -> bool
     def only_copy: () -> bool
     def only_copy=: (bool) -> bool
+    def pure: () -> bool
+    def pure=: (bool) -> bool
   end
 
   class Params = ContextParams
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h