llama_cpp 0.3.3 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +439 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +759 -136
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +250 -111
- data/ext/llama_cpp/src/ggml-metal.metal +614 -483
- data/ext/llama_cpp/src/ggml.c +793 -1032
- data/ext/llama_cpp/src/ggml.h +95 -18
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +626 -166
- data/ext/llama_cpp/src/llama.h +94 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +36 -1
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -53,6 +53,10 @@
|
|
53
53
|
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
54
54
|
#endif
|
55
55
|
|
56
|
+
#ifndef LLAMA_DEFAULT_RMS_EPS
|
57
|
+
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
|
58
|
+
#endif
|
59
|
+
|
56
60
|
#ifdef __cplusplus
|
57
61
|
extern "C" {
|
58
62
|
#endif
|
@@ -83,12 +87,20 @@ extern "C" {
|
|
83
87
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
84
88
|
|
85
89
|
struct llama_context_params {
|
86
|
-
uint32_t seed;
|
87
|
-
int32_t n_ctx;
|
88
|
-
int32_t n_batch;
|
89
|
-
int32_t
|
90
|
-
|
91
|
-
|
90
|
+
uint32_t seed; // RNG seed, -1 for random
|
91
|
+
int32_t n_ctx; // text context
|
92
|
+
int32_t n_batch; // prompt processing batch size
|
93
|
+
int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
|
94
|
+
float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
|
95
|
+
int32_t n_gpu_layers; // number of layers to store in VRAM
|
96
|
+
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
97
|
+
|
98
|
+
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
99
|
+
|
100
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
101
|
+
float rope_freq_base; // RoPE base frequency
|
102
|
+
float rope_freq_scale; // RoPE frequency scaling factor
|
103
|
+
|
92
104
|
// called with a progress value between 0 and 1, pass NULL to disable
|
93
105
|
llama_progress_callback progress_callback;
|
94
106
|
// context pointer passed to the progress callback
|
@@ -134,6 +146,40 @@ extern "C" {
|
|
134
146
|
bool quantize_output_tensor; // quantize output.weight
|
135
147
|
} llama_model_quantize_params;
|
136
148
|
|
149
|
+
// grammar types
|
150
|
+
struct llama_grammar;
|
151
|
+
|
152
|
+
// grammar element type
|
153
|
+
enum llama_gretype {
|
154
|
+
// end of rule definition
|
155
|
+
LLAMA_GRETYPE_END = 0,
|
156
|
+
|
157
|
+
// start of alternate definition for rule
|
158
|
+
LLAMA_GRETYPE_ALT = 1,
|
159
|
+
|
160
|
+
// non-terminal element: reference to rule
|
161
|
+
LLAMA_GRETYPE_RULE_REF = 2,
|
162
|
+
|
163
|
+
// terminal element: character (code point)
|
164
|
+
LLAMA_GRETYPE_CHAR = 3,
|
165
|
+
|
166
|
+
// inverse char(s) ([^a], [^a-b] [^abc])
|
167
|
+
LLAMA_GRETYPE_CHAR_NOT = 4,
|
168
|
+
|
169
|
+
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
|
170
|
+
// be an inclusive range ([a-z])
|
171
|
+
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
|
172
|
+
|
173
|
+
// modifies a preceding LLAMA_GRETYPE_CHAR or
|
174
|
+
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
175
|
+
LLAMA_GRETYPE_CHAR_ALT = 6,
|
176
|
+
};
|
177
|
+
|
178
|
+
typedef struct llama_grammar_element {
|
179
|
+
enum llama_gretype type;
|
180
|
+
uint32_t value; // Unicode code point or rule ID
|
181
|
+
} llama_grammar_element;
|
182
|
+
|
137
183
|
// performance timing information
|
138
184
|
struct llama_timings {
|
139
185
|
double t_start_ms;
|
@@ -148,6 +194,8 @@ extern "C" {
|
|
148
194
|
int32_t n_eval;
|
149
195
|
};
|
150
196
|
|
197
|
+
LLAMA_API int llama_max_devices();
|
198
|
+
|
151
199
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
152
200
|
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
153
201
|
|
@@ -270,10 +318,21 @@ extern "C" {
|
|
270
318
|
int n_max_tokens,
|
271
319
|
bool add_bos);
|
272
320
|
|
321
|
+
LLAMA_API int llama_tokenize_with_model(
|
322
|
+
const struct llama_model * model,
|
323
|
+
const char * text,
|
324
|
+
llama_token * tokens,
|
325
|
+
int n_max_tokens,
|
326
|
+
bool add_bos);
|
327
|
+
|
273
328
|
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
274
329
|
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
275
330
|
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
276
331
|
|
332
|
+
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
|
333
|
+
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
|
334
|
+
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
|
335
|
+
|
277
336
|
// Get the vocabulary as output parameters.
|
278
337
|
// Returns number of results.
|
279
338
|
LLAMA_API int llama_get_vocab(
|
@@ -282,6 +341,12 @@ extern "C" {
|
|
282
341
|
float * scores,
|
283
342
|
int capacity);
|
284
343
|
|
344
|
+
LLAMA_API int llama_get_vocab_from_model(
|
345
|
+
const struct llama_model * model,
|
346
|
+
const char * * strings,
|
347
|
+
float * scores,
|
348
|
+
int capacity);
|
349
|
+
|
285
350
|
// Token logits obtained from the last call to llama_eval()
|
286
351
|
// The logits for the last token are stored in the last row
|
287
352
|
// Can be mutated in order to change the probabilities of the next token
|
@@ -294,13 +359,28 @@ extern "C" {
|
|
294
359
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
295
360
|
|
296
361
|
// Token Id -> String. Uses the vocabulary in the provided context
|
297
|
-
LLAMA_API const char * llama_token_to_str(
|
362
|
+
LLAMA_API const char * llama_token_to_str(
|
363
|
+
const struct llama_context * ctx,
|
364
|
+
llama_token token);
|
365
|
+
|
366
|
+
LLAMA_API const char * llama_token_to_str_with_model(
|
367
|
+
const struct llama_model * model,
|
368
|
+
llama_token token);
|
298
369
|
|
299
370
|
// Special tokens
|
300
371
|
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
|
301
372
|
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
|
302
373
|
LLAMA_API llama_token llama_token_nl(); // next-line
|
303
374
|
|
375
|
+
// Grammar
|
376
|
+
//
|
377
|
+
LLAMA_API struct llama_grammar * llama_grammar_init(
|
378
|
+
const llama_grammar_element ** rules,
|
379
|
+
size_t n_rules,
|
380
|
+
size_t start_rule_index);
|
381
|
+
|
382
|
+
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
|
383
|
+
|
304
384
|
// Sampling functions
|
305
385
|
|
306
386
|
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
@@ -313,13 +393,11 @@ extern "C" {
|
|
313
393
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
314
394
|
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
315
395
|
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
316
|
-
/// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
|
317
396
|
LLAMA_API void llama_sample_classifier_free_guidance(
|
318
397
|
struct llama_context * ctx,
|
319
398
|
llama_token_data_array * candidates,
|
320
399
|
struct llama_context * guidance_ctx,
|
321
|
-
float scale,
|
322
|
-
float smooth_factor);
|
400
|
+
float scale);
|
323
401
|
|
324
402
|
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
325
403
|
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
@@ -337,6 +415,9 @@ extern "C" {
|
|
337
415
|
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
338
416
|
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
339
417
|
|
418
|
+
/// @details Apply constraints from grammar
|
419
|
+
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
|
420
|
+
|
340
421
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
341
422
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
342
423
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
@@ -358,6 +439,9 @@ extern "C" {
|
|
358
439
|
/// @details Randomly selects a token from the candidates based on their probabilities.
|
359
440
|
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
360
441
|
|
442
|
+
/// @details Accepts the sampled token into the grammar
|
443
|
+
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
444
|
+
|
361
445
|
// Performance information
|
362
446
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
363
447
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.3.3'
|
6
|
+
VERSION = '0.3.5'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-
|
9
|
+
LLAMA_CPP_VERSION = 'master-1a94186'
|
10
10
|
end
|
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
|
|
26
26
|
LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
|
27
27
|
LLAMA_FTYPE_MOSTLY_Q6_K: Integer
|
28
28
|
|
29
|
+
LLAMA_GRETYPE_END: Integer
|
30
|
+
LLAMA_GRETYPE_ALT: Integer
|
31
|
+
LLAMA_GRETYPE_RULE_REF: Integer
|
32
|
+
LLAMA_GRETYPE_CHAR: Integer
|
33
|
+
LLAMA_GRETYPE_CHAR_NOT: Integer
|
34
|
+
LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
|
35
|
+
LLAMA_GRETYPE_CHAR_ALT: Integer
|
36
|
+
|
29
37
|
def self?.backend_init: (?numa: bool) -> void
|
30
38
|
def self?.backend_free: () -> void
|
31
39
|
def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
|
@@ -39,6 +47,7 @@ module LLaMACpp
|
|
39
47
|
def self?.token_nl: () -> Integer
|
40
48
|
def self?.mmap_supported?: () -> bool
|
41
49
|
def self?.mlock_supported?: () -> bool
|
50
|
+
def self?.max_devices: () -> Integer
|
42
51
|
|
43
52
|
class TokenData
|
44
53
|
public
|
@@ -69,6 +78,12 @@ module LLaMACpp
|
|
69
78
|
def free: () -> void
|
70
79
|
def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
|
71
80
|
def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
|
81
|
+
def n_vocab: () -> Integer
|
82
|
+
def n_ctx: () -> Integer
|
83
|
+
def n_embd: () -> Integer
|
84
|
+
def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
|
85
|
+
def token_to_str: (Integer) -> String
|
86
|
+
def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
|
72
87
|
end
|
73
88
|
|
74
89
|
class Timings
|
@@ -109,7 +124,7 @@ module LLaMACpp
|
|
109
124
|
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
110
125
|
def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
|
111
126
|
def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
|
112
|
-
def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
|
127
|
+
def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
|
113
128
|
def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
|
114
129
|
def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
|
115
130
|
def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
|
@@ -120,6 +135,8 @@ module LLaMACpp
|
|
120
135
|
def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
|
121
136
|
def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
|
122
137
|
def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
|
138
|
+
def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
|
139
|
+
def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
|
123
140
|
end
|
124
141
|
|
125
142
|
class ContextParams
|
@@ -140,6 +157,10 @@ module LLaMACpp
|
|
140
157
|
def main_gpu: () -> Integer
|
141
158
|
def main_gpu=: (Integer) -> Integer
|
142
159
|
def tensor_split: () -> Array[Float]
|
160
|
+
def rope_freq_base=: (Float) -> Float
|
161
|
+
def rope_freq_base: () -> Float
|
162
|
+
def rope_freq_scale=: (Float) -> Float
|
163
|
+
def rope_freq_scale: () -> Float
|
143
164
|
def low_vram: () -> bool
|
144
165
|
def low_vram=: (bool) -> bool
|
145
166
|
def seed: () -> Integer
|
@@ -166,4 +187,18 @@ module LLaMACpp
|
|
166
187
|
end
|
167
188
|
|
168
189
|
class Params = ContextParams
|
190
|
+
|
191
|
+
class GrammarElement
|
192
|
+
public
|
193
|
+
|
194
|
+
def initialize: (?type: Integer, ?value: Integer) -> void
|
195
|
+
def type: () -> Integer
|
196
|
+
def type=: (Integer) -> Integer
|
197
|
+
def value: () -> Integer
|
198
|
+
def value=: (Integer) -> Integer
|
199
|
+
end
|
200
|
+
|
201
|
+
class Grammar
|
202
|
+
def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
|
203
|
+
end
|
169
204
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-07-
|
11
|
+
date: 2023-07-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|