llama_cpp 0.8.0 → 0.9.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +228 -165
- data/ext/llama_cpp/src/ggml-cuda.cu +441 -77
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +71 -42
- data/ext/llama_cpp/src/ggml-metal.metal +171 -35
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1303 -3419
- data/ext/llama_cpp/src/ggml.h +33 -11
- data/ext/llama_cpp/src/llama.cpp +1925 -2655
- data/ext/llama_cpp/src/llama.h +48 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +34 -14
- metadata +5 -4
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        = 0,
+        LLAMA_ROPE_SCALING_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_YARN        = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float
-        float
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
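These new context fields surface in the Ruby bindings as `ContextParams` accessors (see the `llama_cpp.rbs` changes further down). Below is a minimal sketch of enabling YaRN scaling from Ruby, assuming the parameterless `ContextParams.new` constructor; the numeric values are illustrative, not recommendations.

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.rope_scaling_type = LLaMACpp::LLAMA_ROPE_SCALING_YARN
params.yarn_orig_ctx     = 4096  # training context size of the model (example value)
params.yarn_ext_factor   = 1.0   # extrapolation mix factor
params.yarn_attn_factor  = 1.0   # magnitude scaling factor
params.yarn_beta_fast    = 32.0  # low correction dim
params.yarn_beta_slow    = 1.0   # high correction dim
```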
@@ -191,6 +205,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
@@ -333,17 +348,14 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    //
-
-
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-            int32_t c0,
-            int32_t c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    //
-    //
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1]
+    // p1 < 0     : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
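In the Ruby bindings the replacement shows up as `Context#kv_cache_clear` alongside the existing `kv_cache_seq_*` methods (see the `llama_cpp.rbs` diff below). A short sketch, assuming `context` is an existing `LLaMACpp::Context`:

```ruby
# Remove sequence 0's tokens from position 32 onward (p1 < 0 means "to the end"),
# then wipe the whole cache before starting an unrelated prompt.
context.kv_cache_seq_rm(0, 32, -1)
context.kv_cache_clear   # replaces the removed kv_cache_tokens_rm(c0, c1)
```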
@@ -494,21 +506,22 @@ extern "C" {
     // Vocab
     //
 
-    LLAMA_API const char * llama_token_get_text(const struct
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
 
-    LLAMA_API float llama_token_get_score(const struct
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct
-    LLAMA_API llama_token llama_token_eos(const struct
-    LLAMA_API llama_token llama_token_nl (const struct
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct
-    LLAMA_API llama_token llama_token_middle(const struct
-    LLAMA_API llama_token llama_token_suffix(const struct
-    LLAMA_API llama_token llama_token_eot (const struct
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
 
     //
     // Tokenization
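Callers that previously asked the context for special token ids now ask the model; in the Ruby bindings these become `Model` methods, reachable through `Context#model` (exactly the change made to the bundled `generate` helper below). A sketch, assuming `context` holds a `LLaMACpp::Context` and `embd` is the array of generated token ids:

```ruby
eos = context.model.token_eos   # was context.token_eos in 0.8.0
bos = context.model.token_bos
nl  = context.model.token_nl

# e.g. stop a generation loop once EOS is produced
done = !embd.empty? && embd.last == eos
```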
@@ -560,21 +573,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            const llama_token * last_tokens,
-            size_t last_tokens_size,
-            float penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
             const llama_token * last_tokens,
-            size_t
-            float
-            float
+            size_t penalty_last_n,
+            float penalty_repeat,
+            float penalty_freq,
+            float penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
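The two old sampling calls are folded into one, and the Ruby binding follows suit with `Context#sample_repetition_penalties` keyword arguments (this is exactly how the bundled `generate` helper is updated below). A hedged sketch, assuming `candidates` is a `TokenDataArray` and `last_n_tokens` holds the recent token ids; the penalty values are illustrative:

```ruby
last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
context.sample_repetition_penalties(
  candidates, last_n_tokens[-last_n_repeat..],
  penalty_repeat: 1.1,    # CTRL-style repetition penalty
  penalty_freq: 0.0,      # OpenAI-style frequency penalty
  penalty_present: 0.0    # OpenAI-style presence penalty
)
```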
@@ -605,6 +612,13 @@ extern "C" {
             float p,
             size_t min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
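The matching Ruby method is `Context#sample_min_p`, declared in the `llama_cpp.rbs` diff below. A sketch of slotting it into a sampling chain, assuming `candidates` is a `TokenDataArray`; the thresholds are illustrative:

```ruby
context.sample_top_k(candidates, k: 40)
context.sample_min_p(candidates, prob: 0.05, min_keep: 1)  # drop tokens below 5% of the top token's probability
context.sample_temp(candidates, temp: 0.8)
```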
@@ -663,6 +677,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.8.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1472'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp
 
       # apply penalties
       last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.
-
-
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
       )
 
       # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp
 
       embd.each { |token| output << context.model.token_to_piece(token) }
 
-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end
 
     output.join.scrub('?').strip.delete_prefix(prompt).strip
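Putting the updated helper to work end-to-end: a hedged sketch that assumes the gem's usual `ModelParams.new` and `Model.new(model_path:, params:)` constructors in addition to the `Context.new(model:, params:)` signature shown in the RBS diff below; the model path is a placeholder.

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(
  model_path: '/path/to/model.gguf',   # placeholder path
  params: LLaMACpp::ModelParams.new    # assumed constructor
)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# generate now stops on context.model.token_eos internally, per the change above
puts LLaMACpp.generate(context, 'Hello, World.')
```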
data/sig/llama_cpp.rbs
CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -82,6 +88,16 @@ module LLaMACpp
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end
 
   class Timings
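With these additions, vocabulary inspection moves from `Context` to `Model` (the corresponding `Context` methods are removed in the next hunk). A small sketch, assuming `model` is an already-loaded `LLaMACpp::Model`:

```ruby
bos = model.token_bos
puts model.text(bos)    # piece text of the BOS token
puts model.score(bos)   # its score
puts model.type(bos)    # its token type as an Integer
```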
@@ -143,16 +159,6 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -162,20 +168,20 @@ module LLaMACpp
     def print_timings: () -> void
     def reset_timings: () -> void
     def kv_cache_token_count: () -> Integer
-    def
+    def kv_cache_clear: () -> void
     def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
     def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
     def kv_cache_seq_keep: (Integer) -> void
-    def kv_cache_seq_shift: (Integer, Integer,
+    def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -201,10 +207,22 @@ module LLaMACpp
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
     def n_threads_batch=: (Integer) -> Integer
+    def rope_scaling_type=: (Integer) -> Integer
+    def rope_scaling_type: () -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
     def rope_freq_scale: () -> Float
+    def yarn_ext_factor=: (Float) -> Float
+    def yarn_ext_factor: () -> Float
+    def yarn_attn_factor=: (Float) -> Float
+    def yarn_attn_factor: () -> Float
+    def yarn_beta_fast=: (Float) -> Float
+    def yarn_beta_fast: () -> Float
+    def yarn_beta_slow=: (Float) -> Float
+    def yarn_beta_slow: () -> Float
+    def yarn_orig_ctx=: (Integer) -> Integer
+    def yarn_orig_ctx: () -> Integer
     def mul_mat_q: () -> bool
     def mul_mat_q=: (bool) -> bool
     def f16_kv: () -> bool
@@ -228,6 +246,8 @@ module LLaMACpp
     def quantize_output_tensor=: (bool) -> bool
     def only_copy: () -> bool
     def only_copy=: (bool) -> bool
+    def pure: () -> bool
+    def pure=: (bool) -> bool
   end
 
   class Params = ContextParams
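The new `pure` flag on `ModelQuantizeParams` pairs with the module-level `model_quantize` whose signature appears earlier in this file. A hedged sketch: the default `ModelQuantizeParams.new` constructor is assumed, the file paths are placeholders, and the quantization target type is left at whatever the params default to.

```ruby
params = LLaMACpp::ModelQuantizeParams.new   # assumed constructor
params.pure = true   # quantize every tensor to the same type, no k-quant mixtures

LLaMACpp.model_quantize(
  input_path: 'model-f16.gguf',     # placeholder path
  output_path: 'model-quantized.gguf',  # placeholder path
  params: params
)
```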
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h