llama_cpp 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,11 +87,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;
-        int32_t  n_ctx;
-        int32_t  n_batch;
-        int32_t  n_gpu_layers;
-        int32_t  main_gpu;
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
@@ -102,6 +108,7 @@ extern "C" {
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
@@ -140,6 +147,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
@@ -332,6 +373,15 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl();  // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -366,6 +416,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -387,6 +440,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
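The header changes above add three new knobs to llama_context_params (n_gqa, rms_norm_eps, mul_mat_q). A minimal Ruby sketch of how they surface through the gem follows; seed and mul_mat_q are confirmed by the RBS changes later in this diff, while the n_gqa and rms_norm_eps accessor names are assumptions that simply mirror the C field names.

  require 'llama_cpp'

  params = LLaMACpp::ContextParams.new
  params.seed      = 42
  params.mul_mat_q = true      # new in this release: opt in to the experimental mul_mat_q kernels
  # Assumed accessors mirroring the new C fields (not shown in the RBS excerpt below):
  # params.n_gqa        = 8    # grouped-query attention, e.g. for LLaMA v2 70B
  # params.rms_norm_eps = 5e-6 # matches LLAMA_DEFAULT_RMS_EPS in llama.h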
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.4'
+  VERSION = '0.3.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-…'
+  LLAMA_CPP_VERSION = 'master-468ea24'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
 
   class ContextParams
@@ -153,6 +163,8 @@ module LLaMACpp
     def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
+    def mul_mat_q: () -> bool
+    def mul_mat_q=: (bool) -> bool
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
@@ -177,4 +189,18 @@ module LLaMACpp
   end
 
   class Params = ContextParams
+
+  class GrammarElement
+    public
+
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end
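Taken together, the signatures above describe the new grammar-constrained sampling flow: filter the candidate tokens through the grammar, sample as usual, then feed the chosen token back into the grammar. A minimal Ruby sketch, assuming an already-evaluated LLaMACpp::Context named context and a LLaMACpp::TokenDataArray named candidates built from its logits (only the grammar-related calls are taken from the signatures above):

  require 'llama_cpp'

  # One rule, root ::= "a": match the single character "a", then end of rule.
  root_rule = [
    LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR, value: 'a'.ord),
    LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_END, value: 0)
  ]
  grammar = LLaMACpp::Grammar.new(rules: [root_rule], start_rule_index: 0)

  context.sample_grammar(candidates, grammar: grammar)          # drop candidates the grammar forbids
  token = context.sample_token(candidates)                      # sample from what remains
  context.grammar_accept_token(grammar: grammar, token: token)  # advance the grammar state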
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-…
+date: 2023-08-04 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -30,6 +30,8 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-alloc.c
+- ext/llama_cpp/src/ggml-alloc.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml-metal.h