llama_cpp 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +439 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +759 -136
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +250 -111
- data/ext/llama_cpp/src/ggml-metal.metal +614 -483
- data/ext/llama_cpp/src/ggml.c +793 -1032
- data/ext/llama_cpp/src/ggml.h +95 -18
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +626 -166
- data/ext/llama_cpp/src/llama.h +94 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +36 -1
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,12 +87,20 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;
-        int32_t n_ctx;
-        int32_t n_batch;
-        int32_t
-
-
+        uint32_t seed; // RNG seed, -1 for random
+        int32_t n_ctx; // text context
+        int32_t n_batch; // prompt processing batch size
+        int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+        float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu; // the GPU that is used for scratch and small tensors
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base; // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
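The hunk above adds several new fields to `llama_context_params` (n_gqa, rms_norm_eps, tensor_split, rope_freq_base, rope_freq_scale). A minimal sketch of filling them, starting from `llama_context_default_params()` (declared later in this header); the helper name `make_params` is hypothetical and the concrete values are placeholders for illustration, not settings taken from this release:

```c
#include "llama.h"

// Illustrative sketch only: the values below are placeholders, not recommendations
// from this diff. LLAMA_DEFAULT_RMS_EPS is the new default added at the top of llama.h.
static struct llama_context_params make_params(void) {
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx           = 2048;                   // text context
    params.n_batch         = 512;                    // prompt processing batch size
    params.n_gqa           = 1;                      // grouped-query attention factor (model dependent)
    params.rms_norm_eps    = LLAMA_DEFAULT_RMS_EPS;  // rms norm epsilon
    params.rope_freq_base  = 10000.0f;               // RoPE base frequency
    params.rope_freq_scale = 1.0f;                   // 1.0f leaves the RoPE frequencies unscaled
    return params;
}
```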
@@ -134,6 +146,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
@@ -148,6 +194,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -270,10 +318,21 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);
 
+    LLAMA_API int llama_tokenize_with_model(
+            const struct llama_model * model,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -282,6 +341,12 @@ extern "C" {
             float * scores,
             int capacity);
 
+    LLAMA_API int llama_get_vocab_from_model(
+            const struct llama_model * model,
+            const char * * strings,
+            float * scores,
+            int capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -294,13 +359,28 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+            llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+            const struct llama_model * model,
+            llama_token token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl(); // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+            size_t n_rules,
+            size_t start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -313,13 +393,11 @@ extern "C" {
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
     /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
     /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
     LLAMA_API void llama_sample_classifier_free_guidance(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
             struct llama_context * guidance_ctx,
-            float scale
-            float smooth_factor);
+            float scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
@@ -337,6 +415,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -358,6 +439,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
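Taken together, the grammar additions above define a constrained-sampling flow: build rules as arrays of llama_grammar_element, create a grammar with llama_grammar_init, mask candidates with llama_sample_grammar before picking a token, and report the pick back via llama_grammar_accept_token. A minimal sketch of that call sequence, assuming the context has already been evaluated; the single-digit rule and the helper function `sample_digit` are illustrative, not part of the header:

```c
#include <stdbool.h>
#include <stdlib.h>
#include "llama.h"

// Illustrative sketch: sample one token constrained by a grammar whose only rule
// accepts a character in the range '0'-'9'. The rule and helper are examples only.
static llama_token sample_digit(struct llama_context * ctx) {
    // rule 0 ::= [0-9] ; CHAR opens the class, CHAR_RNG_UPPER closes the range,
    // END terminates the rule definition.
    static const llama_grammar_element digit_rule[] = {
        { LLAMA_GRETYPE_CHAR,           '0' },
        { LLAMA_GRETYPE_CHAR_RNG_UPPER, '9' },
        { LLAMA_GRETYPE_END,             0  },
    };
    const llama_grammar_element * rules[] = { digit_rule };
    struct llama_grammar * grammar = llama_grammar_init(rules, 1, 0);

    // Build the usual candidate array from the logits of the last evaluation.
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);
    llama_token_data * data = malloc((size_t) n_vocab * sizeof(*data));
    for (int i = 0; i < n_vocab; i++) {
        data[i] = (llama_token_data){ i, logits[i], 0.0f };
    }
    llama_token_data_array candidates = { data, (size_t) n_vocab, false };

    llama_sample_grammar(ctx, &candidates, grammar);   // mask tokens the grammar rejects
    llama_token token = llama_sample_token(ctx, &candidates);
    llama_grammar_accept_token(ctx, grammar, token);   // advance the grammar state

    llama_grammar_free(grammar);
    free(data);
    return token;
}
```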
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.3'
+  VERSION = '0.3.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-1a94186'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -39,6 +47,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer
 
   class TokenData
     public
@@ -69,6 +78,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end
 
   class Timings
@@ -109,7 +124,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
-    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -120,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
 
   class ContextParams
@@ -140,6 +157,10 @@ module LLaMACpp
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
@@ -166,4 +187,18 @@ module LLaMACpp
   end
 
   class Params = ContextParams
+
+  class GrammarElement
+    public
+
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: