llama_cpp 0.3.4 → 0.3.5
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
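For reference, this epsilon sits under the square root in RMS normalization; the standard formulation (not quoted from the source, but the usual definition) is

$$\mathrm{RMSNorm}(x)_i = \frac{x_i}{\sqrt{\tfrac{1}{n}\sum_{j=1}^{n} x_j^2 + \varepsilon}} \, g_i,$$

so `LLAMA_DEFAULT_RMS_EPS = 5e-6` merely keeps the denominator nonzero without noticeably perturbing the scale. It became configurable (see `rms_norm_eps` in the next hunk) because checkpoints are trained with different epsilons, e.g. 1e-5 for LLaMA-2.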
@@ -83,11 +87,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;
-        int32_t  n_ctx;
-        int32_t  n_batch;
-        int32_t  n_gpu_layers;
-        int32_t  main_gpu;
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
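The two TEMP fields are what enable LLaMA-2 70B, which uses grouped-query attention with n_gqa = 8. A minimal Ruby-side sketch; the `n_gqa=` and `rms_norm_eps=` setter names are assumptions mirroring the C fields and are not confirmed by the excerpts in this diff:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ctx        = 2048
params.n_gqa        = 8    # hypothetical setter mirroring llama_context_params.n_gqa; 8 is what LLaMA-2 70B needs
params.rms_norm_eps = 5e-6 # hypothetical setter; 5e-6 matches LLAMA_DEFAULT_RMS_EPS above
```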
@@ -140,6 +146,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
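Per the enum comments, a rule is encoded as a flat array of llama_grammar_element values: alternates are separated by LLAMA_GRETYPE_ALT and every rule is terminated by LLAMA_GRETYPE_END. As a sketch, the grammar `root ::= "yes" | "no"` built through the Ruby wrappers this release adds (GrammarElement and Grammar, whose signatures appear in the sig/llama_cpp.rbs section below):

```ruby
el = ->(type, value = 0) { LLaMACpp::GrammarElement.new(type: type, value: value) }

# root ::= "yes" | "no" -- terminals are Unicode code points (LLAMA_GRETYPE_CHAR)
root = [
  el.call(LLaMACpp::LLAMA_GRETYPE_CHAR, 'y'.ord),
  el.call(LLaMACpp::LLAMA_GRETYPE_CHAR, 'e'.ord),
  el.call(LLaMACpp::LLAMA_GRETYPE_CHAR, 's'.ord),
  el.call(LLaMACpp::LLAMA_GRETYPE_ALT),       # start of the alternate "no"
  el.call(LLaMACpp::LLAMA_GRETYPE_CHAR, 'n'.ord),
  el.call(LLaMACpp::LLAMA_GRETYPE_CHAR, 'o'.ord),
  el.call(LLaMACpp::LLAMA_GRETYPE_END)        # end of rule definition
]

grammar = LLaMACpp::Grammar.new(rules: [root], start_rule_index: 0)
```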
@@ -332,6 +372,15 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl();  // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -366,6 +415,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -387,6 +439,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
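Taken together, the intended call order is: constrain the candidates with llama_sample_grammar, sample a token, then advance the grammar state with llama_grammar_accept_token. A sketch of that loop through the Ruby bindings, assuming a Context built elsewhere and the Grammar from the earlier sketch; the TokenData/TokenDataArray construction follows the gem's existing sampling API and is illustrative rather than exact:

```ruby
n_past = 0
tokens = context.tokenize(text: 'Answer yes or no: ', add_bos: true)

32.times do
  context.eval(tokens: tokens, n_past: n_past)
  n_past += tokens.size

  logits = context.logits
  candidates = LLaMACpp::TokenDataArray.new(
    Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
  )

  context.sample_grammar(candidates, grammar: grammar)          # mask out tokens the grammar disallows
  token = context.sample_token(candidates)                      # sample from what remains
  break if token == LLaMACpp.token_eos

  context.grammar_accept_token(grammar: grammar, token: token)  # advance the grammar state
  tokens = [token]
end
```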
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.4'
+  VERSION = '0.3.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-1a94186'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
 
   class ContextParams
@@ -177,4 +187,18 @@ module LLaMACpp
   end
 
   class Params = ContextParams
+
+  class GrammarElement
+    public
+
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: