llama_cpp 0.3.4 → 0.3.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED

@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,11 +87,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;
-        int32_t  n_ctx;
-        int32_t  n_batch;
-        int32_t  n_gpu_layers;
-        int32_t  main_gpu;
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
@@ -140,6 +146,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
@@ -332,6 +372,15 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl();  // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -366,6 +415,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -387,6 +439,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
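To make the new grammar element encoding concrete, here is a minimal sketch using the Ruby wrapper classes this release adds (declared in the data/sig/llama_cpp.rbs changes below). It encodes a single illustrative rule, root ::= [0-9]: a LLAMA_GRETYPE_CHAR element gives the lower bound, a following LLAMA_GRETYPE_CHAR_RNG_UPPER turns it into an inclusive range, and LLAMA_GRETYPE_END terminates the rule. The rule content is only an example, not taken from the gem's documentation.

# Minimal sketch: each rule is an Array of GrammarElement terminated by LLAMA_GRETYPE_END.
digit_rule = [
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR,           value: '0'.ord), # range lower bound
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR_RNG_UPPER, value: '9'.ord), # range upper bound ([0-9])
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_END,            value: 0)        # end of rule definition
]

# rules is an Array of such rules; start_rule_index selects the root rule.
grammar = LLaMACpp::Grammar.new(rules: [digit_rule], start_rule_index: 0)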
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.4'
+  VERSION = '0.3.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-1a94186'
 end
data/sig/llama_cpp.rbs
CHANGED

@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
 
   class ContextParams
@@ -177,4 +187,18 @@ module LLaMACpp
   end
 
   class Params = ContextParams
+
+  class GrammarElement
+    public
+
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end
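Putting the new signatures together, one grammar-constrained sampling step could look like the sketch below. It assumes `context` is an initialized LLaMACpp::Context, `candidates` is a LLaMACpp::TokenDataArray built from the current logits (as for the existing sample_* methods), and `grammar` is the Grammar built earlier; the variable names are placeholders, not part of the gem's API.

# Sketch of a single constrained sampling step:
context.sample_grammar(candidates, grammar: grammar)          # filter out candidates the grammar does not allow
token = context.sample_token(candidates)                      # sample from the remaining candidates
context.grammar_accept_token(grammar: grammar, token: token)  # advance the grammar state with the chosen token

Calling grammar_accept_token after each sampled token keeps the grammar's parse state in sync, so the next sample_grammar call constrains the following position correctly.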
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: