llama_cpp 0.3.4 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -53,6 +53,10 @@
|
|
53
53
|
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
54
54
|
#endif
|
55
55
|
|
56
|
+
#ifndef LLAMA_DEFAULT_RMS_EPS
|
57
|
+
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
|
58
|
+
#endif
|
59
|
+
|
56
60
|
#ifdef __cplusplus
|
57
61
|
extern "C" {
|
58
62
|
#endif
|
@@ -83,11 +87,13 @@ extern "C" {
|
|
83
87
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
84
88
|
|
85
89
|
struct llama_context_params {
|
86
|
-
uint32_t seed;
|
87
|
-
int32_t n_ctx;
|
88
|
-
int32_t n_batch;
|
89
|
-
int32_t n_gpu_layers; // number of layers to store in VRAM
|
90
|
-
|
90
|
+
uint32_t seed; // RNG seed, -1 for random
|
91
|
+
int32_t n_ctx; // text context
|
92
|
+
int32_t n_batch; // prompt processing batch size
|
93
|
+
int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
|
94
|
+
float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
|
95
|
+
int32_t n_gpu_layers; // number of layers to store in VRAM
|
96
|
+
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
91
97
|
|
92
98
|
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
93
99
|
|
@@ -102,6 +108,7 @@ extern "C" {
|
|
102
108
|
|
103
109
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
104
110
|
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
111
|
+
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
|
105
112
|
bool f16_kv; // use fp16 for KV cache
|
106
113
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
107
114
|
bool vocab_only; // only load the vocabulary, no weights
|
@@ -140,6 +147,40 @@ extern "C" {
|
|
140
147
|
bool quantize_output_tensor; // quantize output.weight
|
141
148
|
} llama_model_quantize_params;
|
142
149
|
|
150
|
+
// grammar types
|
151
|
+
struct llama_grammar;
|
152
|
+
|
153
|
+
// grammar element type
|
154
|
+
enum llama_gretype {
|
155
|
+
// end of rule definition
|
156
|
+
LLAMA_GRETYPE_END = 0,
|
157
|
+
|
158
|
+
// start of alternate definition for rule
|
159
|
+
LLAMA_GRETYPE_ALT = 1,
|
160
|
+
|
161
|
+
// non-terminal element: reference to rule
|
162
|
+
LLAMA_GRETYPE_RULE_REF = 2,
|
163
|
+
|
164
|
+
// terminal element: character (code point)
|
165
|
+
LLAMA_GRETYPE_CHAR = 3,
|
166
|
+
|
167
|
+
// inverse char(s) ([^a], [^a-b] [^abc])
|
168
|
+
LLAMA_GRETYPE_CHAR_NOT = 4,
|
169
|
+
|
170
|
+
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
|
171
|
+
// be an inclusive range ([a-z])
|
172
|
+
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
|
173
|
+
|
174
|
+
// modifies a preceding LLAMA_GRETYPE_CHAR or
|
175
|
+
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
176
|
+
LLAMA_GRETYPE_CHAR_ALT = 6,
|
177
|
+
};
|
178
|
+
|
179
|
+
typedef struct llama_grammar_element {
|
180
|
+
enum llama_gretype type;
|
181
|
+
uint32_t value; // Unicode code point or rule ID
|
182
|
+
} llama_grammar_element;
|
183
|
+
|
143
184
|
// performance timing information
|
144
185
|
struct llama_timings {
|
145
186
|
double t_start_ms;
|
@@ -332,6 +373,15 @@ extern "C" {
|
|
332
373
|
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
|
333
374
|
LLAMA_API llama_token llama_token_nl(); // next-line
|
334
375
|
|
376
|
+
// Grammar
|
377
|
+
//
|
378
|
+
LLAMA_API struct llama_grammar * llama_grammar_init(
|
379
|
+
const llama_grammar_element ** rules,
|
380
|
+
size_t n_rules,
|
381
|
+
size_t start_rule_index);
|
382
|
+
|
383
|
+
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
|
384
|
+
|
335
385
|
// Sampling functions
|
336
386
|
|
337
387
|
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
@@ -366,6 +416,9 @@ extern "C" {
|
|
366
416
|
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
367
417
|
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
368
418
|
|
419
|
+
/// @details Apply constraints from grammar
|
420
|
+
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
|
421
|
+
|
369
422
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
370
423
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
371
424
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
@@ -387,6 +440,9 @@ extern "C" {
|
|
387
440
|
/// @details Randomly selects a token from the candidates based on their probabilities.
|
388
441
|
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
389
442
|
|
443
|
+
/// @details Accepts the sampled token into the grammar
|
444
|
+
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
445
|
+
|
390
446
|
// Performance information
|
391
447
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
392
448
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.3.4'
|
6
|
+
VERSION = '0.3.6'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-
|
9
|
+
LLAMA_CPP_VERSION = 'master-468ea24'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
|
|
26
26
|
LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
|
27
27
|
LLAMA_FTYPE_MOSTLY_Q6_K: Integer
|
28
28
|
|
29
|
+
LLAMA_GRETYPE_END: Integer
|
30
|
+
LLAMA_GRETYPE_ALT: Integer
|
31
|
+
LLAMA_GRETYPE_RULE_REF: Integer
|
32
|
+
LLAMA_GRETYPE_CHAR: Integer
|
33
|
+
LLAMA_GRETYPE_CHAR_NOT: Integer
|
34
|
+
LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
|
35
|
+
LLAMA_GRETYPE_CHAR_ALT: Integer
|
36
|
+
|
29
37
|
def self?.backend_init: (?numa: bool) -> void
|
30
38
|
def self?.backend_free: () -> void
|
31
39
|
def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
|
@@ -127,6 +135,8 @@ module LLaMACpp
|
|
127
135
|
def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
|
128
136
|
def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
|
129
137
|
def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
|
138
|
+
def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
|
139
|
+
def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
|
130
140
|
end
|
131
141
|
|
132
142
|
class ContextParams
|
@@ -153,6 +163,8 @@ module LLaMACpp
|
|
153
163
|
def rope_freq_scale: () -> Float
|
154
164
|
def low_vram: () -> bool
|
155
165
|
def low_vram=: (bool) -> bool
|
166
|
+
def mul_mat_q: () -> bool
|
167
|
+
def mul_mat_q=: (bool) -> bool
|
156
168
|
def seed: () -> Integer
|
157
169
|
def seed=: (Integer) -> Integer
|
158
170
|
def use_mlock: () -> bool
|
@@ -177,4 +189,18 @@ module LLaMACpp
|
|
177
189
|
end
|
178
190
|
|
179
191
|
class Params = ContextParams
|
192
|
+
|
193
|
+
class GrammarElement
|
194
|
+
public
|
195
|
+
|
196
|
+
def initialize: (?type: Integer, ?value: Integer) -> void
|
197
|
+
def type: () -> Integer
|
198
|
+
def type=: (Integer) -> Integer
|
199
|
+
def value: () -> Integer
|
200
|
+
def value=: (Integer) -> Integer
|
201
|
+
end
|
202
|
+
|
203
|
+
class Grammar
|
204
|
+
def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
|
205
|
+
end
|
180
206
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-08-04 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -30,6 +30,8 @@ files:
|
|
30
30
|
- ext/llama_cpp/llama_cpp.cpp
|
31
31
|
- ext/llama_cpp/llama_cpp.h
|
32
32
|
- ext/llama_cpp/src/LICENSE
|
33
|
+
- ext/llama_cpp/src/ggml-alloc.c
|
34
|
+
- ext/llama_cpp/src/ggml-alloc.h
|
33
35
|
- ext/llama_cpp/src/ggml-cuda.cu
|
34
36
|
- ext/llama_cpp/src/ggml-cuda.h
|
35
37
|
- ext/llama_cpp/src/ggml-metal.h
|